<a href="https://colab.research.google.com/github/budennovsk/AuthorBooksComments/blob/master/%D0%9F%D0%BE%D0%B4%D0%B3%D0%BE%D1%82%D0%BE%D0%B2%D0%BA%D0%B0_%D1%81%D0%BE%D0%B1%D0%B5%D1%81_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gymnasium as gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import models, layers, optimizers
from collections import deque
import random
import matplotlib.pyplot as plt  # Визуализация

# Neural network used to approximate the Q-function
def build_model(state_size, action_size):
    """Build and compile the Q-value network.

    Args:
        state_size: Number of features in one observation vector.
        action_size: Number of discrete actions; one linear output per action.

    Returns:
        A compiled Keras ``Sequential`` model (two hidden ReLU layers of 24
        units, MSE loss, Adam with learning rate 0.001).
    """
    network = models.Sequential([
        layers.Input(shape=(state_size,)),
        layers.Dense(24, activation='relu'),
        layers.Dense(24, activation='relu'),
        # Linear head: raw Q-value estimates, one per action.
        layers.Dense(action_size, activation='linear'),
    ])
    adam = optimizers.Adam(learning_rate=0.001)
    network.compile(loss='mse', optimizer=adam)
    return network

# Epsilon-greedy action selection
def choose_action(state, model, epsilon, action_size):
    """Pick an action using an epsilon-greedy policy.

    Args:
        state: Observation passed unchanged to ``model.predict``.
        model: Object exposing ``predict(state, verbose=0)``; the first row of
            its result is read as the per-action Q-values.
        epsilon: Exploration threshold compared against a uniform draw from
            ``np.random.rand()``.
        action_size: Number of actions to sample from when exploring.

    Returns:
        A random action index when exploring, otherwise the index of the
        largest predicted Q-value.
    """
    exploring = np.random.rand() <= epsilon
    if exploring:
        # Exploration branch: uniform random action.
        return random.randrange(action_size)
    # Exploitation branch: greedy action w.r.t. the current Q estimates.
    predicted = model.predict(state, verbose=0)
    first_row = predicted[0]
    return np.argmax(first_row)

# Experience-replay training step
def replay():
    """Train the global ``model`` on a random minibatch drawn from ``memory``.

    Does nothing until ``memory`` holds at least ``batch_size`` transitions.
    Each sampled transition is fitted individually: the Q-value of the taken
    action is replaced by the one-step target and the model is fitted for one
    epoch on that single sample.
    """
    if len(memory) < batch_size:
        return
    for state, action, reward, next_state, done in random.sample(memory, batch_size):
        if done:
            # Transition flagged as final: no future value is added.
            target = reward
        else:
            # Bootstrap from the best predicted Q-value of the next state.
            best_next = np.amax(model.predict(next_state, verbose=0)[0])
            target = reward + gamma * best_next
        # Only the chosen action's output is moved toward the target; the
        # remaining outputs keep the model's own predictions.
        fit_target = model.predict(state, verbose=0)
        fit_target[0][action] = target
        model.fit(state, fit_target, epochs=1, verbose=0)

# Environment and hyperparameter setup
env = gym.make('CartPole-v1', render_mode='human')  # Gymnasium environment
action_size = env.action_space.n
state_size = env.observation_space.shape[0]

# Exploration schedule: start value, lower bound, and per-episode
# multiplicative decay of the exploration probability.
epsilon, epsilon_min, epsilon_decay = 1.0, 0.01, 0.995

gamma = 0.95  # Discount factor
batch_size = 32

# Bounded buffer of stored experience; oldest entries are dropped past 2000.
memory = deque(maxlen=2000)
model = build_model(state_size, action_size)

# Bookkeeping for the reward plot
episodes = 5
rewards = []

# Main training loop: play `episodes` episodes, store every transition in
# `memory`, then decay epsilon and run one replay() pass per episode.
for episode in range(episodes):
    state, _ = env.reset()
    state = np.reshape(state, [1, state_size])  # add a batch dimension
    total_reward = 0

    # Hard cap of 500 environment steps per episode.
    for _step in range(500):
        action = choose_action(state, model, epsilon, action_size)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        next_state = np.reshape(next_state, [1, state_size])

        # Store `terminated`, not `done`, as the terminal flag. Gymnasium
        # signals cut-offs such as a time limit through `truncated`; the state
        # reached there is not a real terminal state, so replay() must still
        # add the discounted next-state value for those transitions.
        memory.append((state, action, reward, next_state, terminated))
        state = next_state
        total_reward += reward

        if done:
            break

    # Record the episode total for the plot. Done after the step loop so that
    # every episode contributes exactly one entry, even if the step cap is hit
    # without the environment reporting the end of the episode.
    rewards.append(total_reward)
    print(f"Episode: {episode}, Reward: {total_reward}, Epsilon: {epsilon:.2f}")

    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

    replay()

# Plot the total reward collected in each episode, then release the environment.
plt.plot(rewards)
plt.xlabel('Эпизод')
# Label restored from a mis-encoded literal (two characters had been replaced
# by U+FFFD); the spelling matches the word used in the title below.
plt.ylabel('Вознаграждение')
plt.title('График суммарного вознаграждения по эпизодам')
plt.show()

env.close()

In [None]:
from gymnasium.wrappers import RecordVideo

# CartPole-v1 with 'rgb_array' rendering, wrapped in RecordVideo. The trigger
# returns True for every even episode index (0, 2, 4, ...); "recordings/" is
# passed as the wrapper's output location.
env = RecordVideo(
    gym.make('CartPole-v1', render_mode='rgb_array'),
    "recordings/",
    episode_trigger=lambda episode_id: episode_id % 2 == 0,
)