In [10]:
# !pip install gym --no-cache-dir
# !pip install pywin32
# !pip install torch
# !pip install pygame

# !pip install pygame==2.1.0
# !pip install gym[classic_control] pygame
!pip install gym[box2d]

Collecting box2d-py==2.3.5 (from gym[box2d])
  Using cached box2d-py-2.3.5.tar.gz (374 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting pygame==2.1.0 (from gym[box2d])
  Using cached pygame-2.1.0.tar.gz (5.8 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'error'


  error: subprocess-exited-with-error
  
  × Getting requirements to build wheel did not run successfully.
  │ exit code: 1
  ╰─> [75 lines of output]
      
      
      Using WINDOWS configuration...
      
      Making dir :prebuilt_downloads:
      Downloading... https://www.libsdl.org/release/SDL2-devel-2.0.16-VC.zip 13d952c333f3c2ebe9b7bc0075b4ad2f784e7584
      Unzipping :prebuilt_downloads\SDL2-devel-2.0.16-VC.zip:
      Downloading... https://www.libsdl.org/projects/SDL_image/release/SDL2_image-devel-2.0.5-VC.zip 137f86474691f4e12e76e07d58d5920c8d844d5b
      Unzipping :prebuilt_downloads\SDL2_image-devel-2.0.5-VC.zip:
      Downloading... https://www.libsdl.org/projects/SDL_ttf/release/SDL2_ttf-devel-2.0.15-VC.zip 1436df41ebc47ac36e02ec9bda5699e80ff9bd27
      Unzipping :prebuilt_downloads\SDL2_ttf-devel-2.0.15-VC.zip:
      Downloading... https://www.libsdl.org/projects/SDL_mixer/release/SDL2_mixer-devel-2.0.4-VC.zip 9097148f4529cf19f805ccd007618dec280f0ecc
      Unzipping :

In [11]:
import gym
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt
from gym.utils.play import play

In [12]:
# Create the LunarLander-v2 environment; the same `env` object is reused by the cells below.
env = gym.make('LunarLander-v2')

DependencyNotInstalled: box2D is not installed, run `pip install gym[box2d]`

In [None]:
# Play the environment from the keyboard: W -> action 2, A -> action 1, D -> action 3.
# NOTE(review): presumably 2 is the main engine and 1/3 are the side engines —
# confirm against the LunarLander documentation.
play(env, keys_to_action={(ord('w'),): 2, (ord('a'),):1, (ord('d'),):3})
env.close()

In [None]:
# Run a single rendered episode with the given policy
def run_episode(policy, env):
    """Play one episode of ``env`` following ``policy`` and return its total reward.

    Both Gym calling conventions are accepted:

    * classic: ``reset() -> obs`` and ``step() -> (obs, reward, done, info)``;
    * gym >= 0.26: ``reset() -> (obs, info)`` and
      ``step() -> (obs, reward, terminated, truncated, info)``.

    Args:
        policy: object exposing ``make_action(state)``.
        env: environment exposing ``reset``, ``step``, ``render`` and ``close``.

    Returns:
        The sum of the rewards received until the episode ended.
    """
    total_reward = 0
    try:
        state = env.reset()
        # Newer Gym returns (observation, info); keep only the observation.
        if isinstance(state, tuple) and len(state) == 2 and isinstance(state[1], dict):
            state = state[0]

        is_done = False
        while not is_done:
            env.render()
            action = policy.make_action(state)
            step_result = env.step(action)
            if len(step_result) == 5:
                state, reward, terminated, truncated, _ = step_result
                # The episode is over on either a real terminal state or a time-limit cut.
                is_done = terminated or truncated
            else:
                state, reward, is_done, _ = step_result
            total_reward += reward
    finally:
        # Always release the environment, even if the policy or the env raised.
        env.close()
    return total_reward

In [None]:
# Skeleton showing the interface every policy is expected to provide
class Policy:
    """Do-nothing template: each method accepts its arguments and returns ``None``."""

    def __init__(self, n_actions):
        """Accept the number of actions; nothing is stored on the instance."""

    def make_action(self, state):
        """Placeholder for action selection; always returns ``None``."""
        return None

    def update(self, state, next_state, action, reward, gamma=1):
        """Placeholder for a learning step; does nothing and returns ``None``."""
        return None

In [None]:
# Policy that ignores the state and picks a random action
class RandomPolicy:
    """Selects an action index at random from ``range(n_actions)``."""

    def __init__(self, n_actions):
        """Remember how many discrete actions are available."""
        self.n_actions = n_actions

    def make_action(self, state):
        """Return a random integer in ``[0, n_actions)``; ``state`` is not used."""
        sampled = torch.randint(low=0, high=self.n_actions, size=(1,))
        return sampled.item()

In [None]:
# Baseline: total reward of a few episodes played with random actions.
# The policy is sized from the environment instead of a hard-coded action count.
random_policy = RandomPolicy(env.action_space.n)
for _ in range(4):
    print(run_episode(random_policy, env))

In [None]:
# Agent with a DQN policy
class DQNPolicy:
    """Epsilon-greedy agent backed by a small fully connected Q-network.

    The network is ``Linear(station_space, n_hidden) -> ReLU ->
    Linear(n_hidden, n_actions)`` and is trained online, one transition at a
    time, with an MSE loss and the Adam optimizer.

    Args:
        n_actions: number of discrete actions (size of the network output).
        station_space: length of the state vector fed to the network.
        n_hidden: width of the hidden layer.
        lr: learning rate passed to Adam.
    """

    def __init__(self, n_actions, station_space, n_hidden=128, lr=0.005):
        self.n_actions = n_actions

        self.dqn = torch.nn.Sequential(
            torch.nn.Linear(station_space, n_hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(n_hidden, n_actions)
        )
        self.loss = torch.nn.MSELoss()
        # Use the caller-supplied learning rate rather than a fixed constant.
        self.optimizer = torch.optim.Adam(self.dqn.parameters(), lr=lr)

    def make_action(self, state, eps=0):
        """Choose an action epsilon-greedily.

        Args:
            state: sequence/array of ``station_space`` numbers.
            eps: exploration threshold; a uniform draw from ``[0, 1)`` that is
                not greater than ``eps`` triggers a random action.

        Returns:
            The chosen action index as a Python ``int``.
        """
        # Draw first: the forward pass is only needed on the greedy branch.
        if torch.rand((1,)).item() > eps:
            with torch.no_grad():
                q_values = self.dqn(torch.as_tensor(state, dtype=torch.float32))
            return torch.argmax(q_values).item()
        return torch.randint(self.n_actions, (1,)).item()

    def update(self, state, next_state, action, reward, gamma=1, done=False):
        """Take one gradient step on a single transition.

        The regression target equals the current Q-values of ``state`` except
        at ``action``, where it is ``reward + gamma * max(Q(next_state))``, or
        just ``reward`` when ``done`` is true.

        Args:
            state: state in which ``action`` was taken.
            next_state: state observed after the action.
            action: index of the action that was taken.
            reward: reward received for the transition.
            gamma: discount factor applied to the bootstrapped value.
            done: set to ``True`` for a terminal transition so that no value
                is bootstrapped from ``next_state``. Defaults to ``False``.
        """
        q_values = self.dqn(torch.as_tensor(state, dtype=torch.float32))

        # The target is a constant for the loss, so build it without tracking gradients.
        with torch.no_grad():
            q_values_should_be = q_values.detach().clone()
            if done:
                q_values_should_be[action] = float(reward)
            else:
                q_values_next = self.dqn(torch.as_tensor(next_state, dtype=torch.float32))
                q_values_should_be[action] = float(
                    reward + gamma * torch.max(q_values_next).item()
                )

        self.optimizer.zero_grad()
        self.loss(q_values, q_values_should_be).backward()
        self.optimizer.step()

In [None]:
# Train the given policy for n_episodes episodes
def learn_policy(env, n_episodes, policy, eps=0.1, render_every=20):
    """Train ``policy`` online on ``env`` and return the reward of every episode.

    After each environment step the policy is updated with
    ``policy.update(state, next_state, action, reward)``; no terminal flag is
    passed to ``update``.

    Both Gym calling conventions are accepted:

    * classic: ``reset() -> obs`` and ``step() -> (obs, reward, done, info)``;
    * gym >= 0.26: ``reset() -> (obs, info)`` and
      ``step() -> (obs, reward, terminated, truncated, info)``.

    Args:
        env: environment exposing ``reset``, ``step``, ``render`` and ``close``.
        n_episodes: number of episodes to run.
        policy: object exposing ``make_action(state, eps=...)`` and ``update``.
        eps: exploration value forwarded to ``policy.make_action``.
        render_every: render every ``render_every``-th episode (starting with
            the first); ``0`` or ``None`` disables rendering.

    Returns:
        List with the total reward of each episode, in order.
    """
    total_rewards = []

    try:
        for episode in tqdm(range(n_episodes)):
            state = env.reset()
            # Newer Gym returns (observation, info); keep only the observation.
            if isinstance(state, tuple) and len(state) == 2 and isinstance(state[1], dict):
                state = state[0]

            show_episode = bool(render_every) and episode % render_every == 0
            is_done = False
            total_reward = 0
            while not is_done:
                if show_episode:
                    env.render()
                action = policy.make_action(state, eps=eps)
                step_result = env.step(action)
                if len(step_result) == 5:
                    next_state, reward, terminated, truncated, _ = step_result
                    is_done = terminated or truncated
                else:
                    next_state, reward, is_done, _ = step_result
                total_reward += reward

                policy.update(state, next_state, action, reward)
                state = next_state
            total_rewards.append(total_reward)
    finally:
        # Always release the environment, even if training was interrupted.
        env.close()
    return total_rewards

In [None]:
# Create the agent, sized from the environment's action and observation spaces
policy = DQNPolicy(
    n_actions=env.action_space.n,
    station_space=env.observation_space.shape[0],
)

In [None]:
# The agent has not been trained yet, so it cannot do anything useful
for _ in range(4):
    episode_reward = run_episode(policy, env)
    print(episode_reward)

In [None]:
# Train the agent for 1000 episodes
total_rewards = learn_policy(env=env, n_episodes=1000, policy=policy)

In [None]:
# Watch how the agent behaves after training
for _ in range(4):
    episode_reward = run_episode(policy, env)
    print(episode_reward)

In [None]:
# A library that takes care of all of this for us!

In [None]:
!pip install stable_baselines3

In [None]:
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy

In [None]:
# Build a stable-baselines3 DQN agent with the 'MlpPolicy' network on the same environment
model = DQN(policy='MlpPolicy', env=env, verbose=1)

In [None]:
# Run one rendered evaluation episode with the model before any training, then close the env
evaluate_policy(
    model,
    env,
    n_eval_episodes=1,
    render=True,
)
env.close()

In [None]:
# Train the model for 300,000 environment timesteps
model.learn(total_timesteps=300_000)

In [None]:
# Persist the model under the path 'my_model'
model.save('my_model')

In [None]:
# Replace the in-memory model with the one loaded back from 'my_model'
model = DQN.load('my_model')

In [None]:
# Run four rendered evaluation episodes with the loaded model, then close the env
evaluate_policy(
    model,
    env,
    n_eval_episodes=4,
    render=True,
)
env.close()