In [1]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# фиксируем сид
np.random.seed(17)
tf.random.set_seed(17)

import warnings

warnings.simplefilter('ignore')

# GPUs = tf.config.list_physical_devices('GPU')
# tf.config.experimental.set_memory_growth(GPUs[0], True)

In [2]:
env = gym.make("CartPole-v1") 

In [3]:
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [4]:
env.action_space

Discrete(2)

In [5]:
num_inputs = 4  
num_actions = 2 

num_hidden = 64

inputs = layers.Input(shape=(num_inputs,))
common1 = layers.Dense(num_hidden, activation="relu")(inputs)
common2 = layers.Dense(num_hidden, activation="relu")(common1)
action = layers.Dense(num_actions, activation="softmax")(common2)
critic = layers.Dense(1)(common2)

model = keras.Model(inputs=inputs, outputs=[action, critic])


In [6]:
optimizer = keras.optimizers.legacy.Adam(learning_rate=0.01)
loss = keras.losses.Huber()

In [7]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 4)]          0           []                               
                                                                                                  
 dense (Dense)                  (None, 64)           320         ['input_1[0][0]']                
                                                                                                  
 dense_1 (Dense)                (None, 64)           4160        ['dense[0][0]']                  
                                                                                                  
 dense_2 (Dense)                (None, 2)            130         ['dense_1[0][0]']                
                                                                                              

In [8]:
gamma = 0.995

max_steps_per_episode = 10000

eps = np.finfo(np.float32).eps.item()

action_probs_history = []
critic_value_history = []
rewards_history = []

running_reward = 0
episode_count = 0

while True:
    state = env.reset(seed=17)
    state = state[0]
    episode_reward = 0
    
    with tf.GradientTape() as tape:
        for timestep in range(1, max_steps_per_episode):
            env.render()
            state = tf.convert_to_tensor(state)
            state = tf.expand_dims(state, 0)
            
            action_probs, critic_value = model(state)
            
            critic_value_history.append(critic_value[0, 0])

            action = np.random.choice(num_actions, p=np.squeeze(action_probs))
            
            action_probs_history.append(tf.math.log(action_probs[0, action]))

            state, reward, done, truncated, info = env.step(action)

            rewards_history.append(reward)
            
            episode_reward += reward

            if done:
                break

        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

        returns = []
        discounted_sum = 0
        for r in rewards_history[::-1]:
            discounted_sum = r + gamma * discounted_sum
            returns.insert(0, discounted_sum)

        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        returns = returns.tolist()

        history = zip(action_probs_history, critic_value_history, returns)
        actor_losses = []
        critic_losses = []
        for log_prob, value, ret in history:
            
            diff = ret - value
            actor_losses.append(-log_prob * diff)  # actor loss

            critic_losses.append(
                loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
            )

        loss_value = sum(actor_losses) + sum(critic_losses)
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        action_probs_history.clear()
        critic_value_history.clear()
        rewards_history.clear()

    episode_count += 1
    
    if episode_count % 10 == 0:
        print(f"Episode: {episode_count}, running reward: {running_reward:.2f}")

    if running_reward > 195:
        print(f"Solved at episode {episode_count}!")
        break

env.close()

Episode: 10, running reward: 8.75
Episode: 20, running reward: 12.45
Episode: 30, running reward: 15.14
Episode: 40, running reward: 13.79
Episode: 50, running reward: 13.58
Episode: 60, running reward: 14.32
Episode: 70, running reward: 15.23
Episode: 80, running reward: 14.28
Episode: 90, running reward: 15.10
Episode: 100, running reward: 16.55
Episode: 110, running reward: 16.36
Episode: 120, running reward: 16.07
Episode: 130, running reward: 18.55
Episode: 140, running reward: 30.46
Episode: 150, running reward: 32.53
Episode: 160, running reward: 34.77
Episode: 170, running reward: 39.02
Episode: 180, running reward: 38.96
Episode: 190, running reward: 29.16
Episode: 200, running reward: 23.60
Episode: 210, running reward: 19.51
Episode: 220, running reward: 17.07
Episode: 230, running reward: 15.36
Episode: 240, running reward: 19.01
Episode: 250, running reward: 23.60
Episode: 260, running reward: 28.36
Episode: 270, running reward: 34.11
Episode: 280, running reward: 34.38
Ep

Посмотрим теперь, как натренированная нейросеть играет с нашей платформой.

In [9]:
max_test_timestaps = 100000
test_episodes = 5

for ep in range(test_episodes):
    state = env.reset()
    state = state[0]
    for timestap in range(max_test_timestaps):
      
        env.render()
        
        state = tf.convert_to_tensor(state)
        state = tf.expand_dims(state, 0)

        action_probs, _ = model(state)

        # здесь уже можно выбирать конкретное действие
        if action_probs[0][0].numpy() >= 0.5:
            action = 0
        else:
            action = 1

        state, _, done, _, _ = env.step(action)

        if done:
            break

env.close()