In [1]:
#install gymnasium-robotics
!pip -q install gymnasium-robotics

In [2]:
#install stable-baselines3 for RL
!pip -q install stable-baselines3

In [1]:
#import dependencies
import gymnasium as gym
from stable_baselines3 import SAC
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.vec_env import VecFrameStack
import os


In [2]:
#make training and log directories
!mkdir Training
!mkdir Training/Logs


mkdir: cannot create directory ‘Training’: File exists
mkdir: cannot create directory ‘Training/Logs’: File exists


In [1]:
#make saved models directory
!mkdir Training/Saved
!mkdir Training/Saved/SAC_model

mkdir: cannot create directory ‘Training/Saved’: File exists


In [2]:
#set paths
log_path = os.path.join('Training', 'Logs')
sac_path = os.path.join('Training', 'Saved')

In [7]:
#make environment for training
env = make_vec_env("Humanoid-v4", n_envs=1, vec_env_cls=SubprocVecEnv)
env = VecFrameStack(env, n_stack=1)



In [6]:
#make environment for testing
env = gym.make('Humanoid-v4',render_mode='human')

In [8]:
#setup model using A2C algorithm
model = SAC("MlpPolicy", env, device="cpu",verbose=1, tensorboard_log=log_path)

Using cpu device


In [11]:
#load saved model for testing
model = SAC.load(os.path.join('Training', 'Saved', 'SAC_model.zip'))


In [5]:
#train model
model.learn(total_timesteps=500000)

Logging to Training/Logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 32.2     |
|    ep_rew_mean     | 162      |
| time/              |          |
|    episodes        | 4        |
|    fps             | 316      |
|    time_elapsed    | 0        |
|    total_timesteps | 129      |
| train/             |          |
|    actor_loss      | -13.1    |
|    critic_loss     | 132      |
|    ent_coef        | 0.994    |
|    ent_coef_loss   | -0.156   |
|    learning_rate   | 0.0003   |
|    n_updates       | 28       |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 29.5     |
|    ep_rew_mean     | 147      |
| time/              |          |
|    episodes        | 8        |
|    fps             | 129      |
|    time_elapsed    | 1        |
|    total_timesteps | 236      |
| train/             |          |
|    actor_loss      | -27.6    |
|    critic_loss 

<stable_baselines3.sac.sac.SAC at 0x7fda2dab55b0>

In [6]:
#save model
model.save(sac_path)

In [7]:
#evaluate model
evaluate_policy(model, env, n_eval_episodes=10, render=True)

(5267.9949601, 185.72534757461008)

In [12]:
#observe model performance
obs = env.reset()[0]
score = 0
steps = 500
while True:
    action, _states = model.predict(obs)
    obs, rewards, trun ,dones, info = env.step(action)
    env.render()
    score+=rewards
    print('Score:{}'.format(score))
    if dones:
        steps -= 1
        print('info', info)
        if steps <= 0:
            break
        
            
        
env.close()

Score:4.821864019210459
Score:9.72051341457307
Score:14.554526046521385
Score:19.462169133421618
Score:24.32714871994052
Score:29.191215359577654
Score:34.05045225589962
Score:38.89363225415944
Score:43.74598570962666
Score:48.6385609502849
Score:53.55294103021855
Score:58.474108058198624
Score:63.370575353604806
Score:68.22375782106896
Score:73.013612600252
Score:77.86668892711553
Score:82.69475302997382
Score:87.51912825975768
Score:92.37316343691607
Score:97.21525879187197
Score:102.02766634092174
Score:106.88235889196841
Score:111.72861810063935
Score:116.5816926518788
Score:121.44978724402493
Score:126.33789323929739
Score:131.24790096166205
Score:136.17409535211874
Score:141.0189263070498
Score:145.8781978240469
Score:150.7499163507687
Score:155.66916192202294
Score:160.64566949436227
Score:165.6036221600772
Score:170.54303601116231
Score:175.44895318158126
Score:180.35878517639185
Score:185.25636549637235
Score:190.1816637608382
Score:195.1601546300142
Score:200.21911815165686
S

In [9]:
#load to continue training
model = SAC.load(os.path.join('Training', 'Saved', 'SAC_model.zip'), tensorboard_log=log_path,reset_num_timesteps=False)
model.set_env(env)

In [10]:
#continue training
model.learn(total_timesteps=100000)

Logging to Training/Logs/SAC_2
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.2     |
|    ep_rew_mean     | 107      |
| time/              |          |
|    episodes        | 4        |
|    fps             | 1582     |
|    time_elapsed    | 0        |
|    total_timesteps | 85       |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 147      |
|    ep_rew_mean     | 776      |
| time/              |          |
|    episodes        | 8        |
|    fps             | 80       |
|    time_elapsed    | 14       |
|    total_timesteps | 1173     |
| train/             |          |
|    actor_loss      | -384     |
|    critic_loss     | 37       |
|    ent_coef        | 0.0384   |
|    ent_coef_loss   | 24.3     |
|    learning_rate   | 0.0003   |
|    n_updates       | 500972   |
---------------------------------
---------------------------------
| rollout/       

<stable_baselines3.sac.sac.SAC at 0x7fdfbe3ad730>

In [11]:
#save model
model.save(sac_path)