In [2]:
pip install gym[box2d]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pygame==2.1.0
  Downloading pygame-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting box2d-py==2.3.5
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting swig==4.*
  Downloading swig-4.1.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m53.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: box2d-py
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did no

In [2]:
import tensorflow as tf
import numpy as np
from collections import deque
import gym
import random

# Create the racing environment once; the loops below reuse this instance.
env = gym.make('CarRacing-v2')

# The observation shape becomes the network input shape, and the first entry
# of the action-space shape is the number of action components to output.
state_size, action_size = env.observation_space.shape, env.action_space.shape[0]

class ActorCritic:
    """Advantage actor-critic agent with convolutional actor and critic networks.

    The actor maps an image observation to the mean of a fixed-variance
    Gaussian policy (one tanh-squashed value per action component); the critic
    maps an observation to a scalar state-value estimate. Transitions are kept
    in a bounded replay memory and sampled uniformly for each update.

    Attributes:
        state_size: shape of a single observation, used as the network input shape.
        action_size: number of action components produced by the actor.
        memory: deque of (state, action, reward, next_state, done) tuples.
        gamma: discount factor applied to the bootstrapped next-state value.
        actor_lr / critic_lr: Adam learning rates of the two networks.
        action_noise: standard deviation of the Gaussian exploration noise.
        actor / critic: the compiled Keras models.
    """

    def __init__(self, state_size, action_size):
        """Store the hyper-parameters and build both networks.

        Args:
            state_size: shape tuple of one observation.
            action_size: number of continuous action components.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95
        self.actor_lr = 0.001
        self.critic_lr = 0.005
        self.action_noise = 0.1
        self.actor = self.build_actor()
        self.critic = self.build_critic()

    def _build_network(self, output_units, output_activation, learning_rate):
        """Build and compile the conv trunk shared in structure by actor and critic.

        Args:
            output_units: width of the final dense layer.
            output_activation: activation of the final dense layer.
            learning_rate: Adam step size.

        Returns:
            A compiled `tf.keras.Model` with an MSE loss.
        """
        inputs = tf.keras.layers.Input(shape=self.state_size)
        # NOTE(review): assumes observations are 0-255 pixel values (as the
        # CarRacing documentation states) -- scaling them to [0, 1] keeps the
        # first-layer activations in a sane range.
        x = tf.keras.layers.Rescaling(1.0 / 255.0)(inputs)
        x = tf.keras.layers.Conv2D(32, (8, 8), strides=(4, 4), activation='relu')(x)
        x = tf.keras.layers.Conv2D(64, (4, 4), strides=(2, 2), activation='relu')(x)
        x = tf.keras.layers.Conv2D(64, (3, 3), strides=(1, 1), activation='relu')(x)
        x = tf.keras.layers.Flatten()(x)
        x = tf.keras.layers.Dense(512, activation='relu')(x)
        outputs = tf.keras.layers.Dense(output_units, activation=output_activation)(x)
        model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by newer Keras releases.
        model.compile(
            loss='mse',
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        )
        return model

    def build_actor(self):
        """Return the policy network: observation -> tanh-bounded action mean."""
        return self._build_network(self.action_size, 'tanh', self.actor_lr)

    def build_critic(self):
        """Return the value network: observation -> scalar state value."""
        return self._build_network(1, 'linear', self.critic_lr)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory (oldest entries are evicted)."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, explore=True):
        """Choose an action for a single observation.

        Args:
            state: one observation with shape `state_size` (no batch axis).
            explore: when True, Gaussian noise with std `action_noise` is added
                to the actor output and the result is clipped to the tanh
                range [-1, 1]; when False the actor output is returned as is.

        Returns:
            1-D float32 array with `action_size` entries.
        """
        batch = np.expand_dims(state, axis=0).astype(np.float32)
        # Calling the model directly avoids the per-call overhead and progress
        # output of `predict` when handling a single observation.
        action = np.asarray(self.actor(batch, training=False))[0]
        if explore and self.action_noise > 0:
            noise = np.random.normal(0.0, self.action_noise, size=action.shape)
            action = np.clip(action + noise, -1.0, 1.0)
        return action.astype(np.float32)

    def train(self, batch_size=32):
        """Run one actor update and one critic update on a sampled minibatch.

        Does nothing until the memory holds at least `batch_size` transitions.

        Args:
            batch_size: number of transitions sampled for the update.
        """
        if len(self.memory) < batch_size:
            return
        samples = random.sample(self.memory, batch_size)

        states = np.array([s[0] for s in samples], dtype=np.float32)
        actions = np.vstack([s[1] for s in samples]).astype(np.float32)
        rewards = np.array([s[2] for s in samples], dtype=np.float32)
        next_states = np.array([s[3] for s in samples], dtype=np.float32)
        dones = np.array([s[4] for s in samples], dtype=np.float32)

        next_values = self.critic.predict(next_states, verbose=0)[:, 0]
        values = self.critic.predict(states, verbose=0)[:, 0]

        # One-step TD target; the (1 - done) mask stops bootstrapping from the
        # value of a state that follows the end of an episode.
        targets = rewards + self.gamma * next_values * (1.0 - dones)
        advantages = targets - values

        # For a fixed-variance Gaussian policy, -log pi(a|s) is proportional to
        # ||a - mu(s)||^2, so an MSE fit towards the actions that were taken,
        # weighted per sample by the advantage, is the policy-gradient step:
        # positive advantages pull the mean towards the action, negative ones
        # push it away.
        self.actor.fit(
            states,
            actions,
            sample_weight=advantages,
            epochs=1,
            batch_size=batch_size,
            verbose=0,
        )

        self.critic.fit(
            states,
            targets.reshape(-1, 1),
            epochs=1,
            batch_size=batch_size,
            verbose=0,
        )

import matplotlib.pyplot as plt

agent = ActorCritic(state_size, action_size)

EPISODES = 100

total_rewards_per_episode = []

for e in range(EPISODES):
    # Newer gym releases return (observation, info) from reset(); older ones
    # return the observation alone. Accept both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    total_reward = 0

    while True:
        action = agent.act(state)

        # Newer gym releases return a 5-tuple with separate terminated and
        # truncated flags; older ones return a 4-tuple with a single done flag.
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result

        agent.remember(state, action, reward, next_state, done)
        agent.train()
        state = next_state
        total_reward += reward

        if done:
            break

    total_rewards_per_episode.append(total_reward)
    print('Episode = ', e, ' | Reward = ', total_reward)

fig, ax = plt.subplots()
ax.plot(total_rewards_per_episode)
ax.set_title('Total Reward per Episode')
ax.set_xlabel('Episode')
ax.set_ylabel('Total Reward')
# Save before showing: once the figure has been shown it is closed, and a
# later pyplot-level savefig would write an empty canvas.
fig.savefig('Q-Actor-Critic-Car-Racing.png')
plt.show()

In [None]:
import matplotlib.pyplot as plt

EPISODES = 10

total_rewards_per_episode = []

for e in range(EPISODES):
    # Newer gym releases return (observation, info) from reset(); older ones
    # return the observation alone. Accept both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    total_reward = 0

    while True:
        # Query the actor network directly (no learning during evaluation);
        # verbose=0 keeps Keras from printing a progress bar on every step.
        batch = np.expand_dims(state, axis=0)
        action = agent.actor.predict(batch, verbose=0)[0]

        # Newer gym releases return a 5-tuple with separate terminated and
        # truncated flags; older ones return a 4-tuple with a single done flag.
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result

        state = next_state
        total_reward += reward

        if done:
            break

    total_rewards_per_episode.append(total_reward)

fig, ax = plt.subplots()
ax.plot(total_rewards_per_episode)
ax.set_title('Total Reward per Episode')
ax.set_xlabel('Episode')
ax.set_ylabel('Total Reward')
# Save before showing: once the figure has been shown it is closed, and a
# later pyplot-level savefig would write an empty canvas.
fig.savefig('Q-Actor-Critic-Car-Racing-Testing.png')
plt.show()