In [3]:
!pip install pyflyt
!pip uninstall stable-baselines3 sb3_contrib -y
!pip install stable-baselines3 sb3_contrib



In [11]:
# !rm -r *

In [9]:
from PyFlyt.core.drones import Rocket
import numpy as np

import gymnasium
import gymnasium as gym
import PyFlyt.gym_envs

from stable_baselines3 import DQN, PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import BaseCallback
import time
import torch.nn as nn

In [5]:
class TrainingMetricsCallback(BaseCallback):
    """Periodically log reward statistics and summarise the run when it ends.

    Every ``check_freq`` callback calls, the mean and standard deviation of
    ``self.locals['rewards']`` (the rewards returned by the most recent
    environment step, not full episode returns) are printed and recorded
    under the ``runtime/`` logger prefix.

    Args:
        check_freq: Number of callback calls between two metric reports.
        verbose: Values above 0 enable printing of the reward statistics.
    """

    def __init__(self, check_freq, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.best_mean_reward = -np.inf
        # One entry per metric check: the mean step reward seen at that check.
        self.episode_rewards = []
        # Episodes that actually finished, counted from the `dones` flags.
        self.completed_episodes = 0
        self.start_time = time.time()

    def _on_training_start(self) -> None:
        """Restart the clock so the reported duration covers training only."""
        self.start_time = time.time()

    def _on_step(self) -> bool:
        """Count finished episodes and, every ``check_freq`` calls, log rewards.

        Returns:
            Always ``True`` so that training is never stopped by this callback.
        """
        dones = self.locals.get('dones')
        if dones is not None:
            self.completed_episodes += int(np.sum(dones))

        if self.n_calls % self.check_freq == 0:
            step_rewards = self.locals['rewards']
            mean_reward = np.mean(step_rewards)
            stddev_reward = np.std(step_rewards)
            if self.verbose > 0:
                print(f"Mean reward: {mean_reward:.2f} +/- {stddev_reward:.2f}")
            self.logger.record('runtime/mean_reward', mean_reward)
            self.logger.record('runtime/stddev_reward', stddev_reward)

            # Track the best value and log it at every check, not only on
            # improvement, so the logged series has no gaps.
            if mean_reward > self.best_mean_reward:
                self.best_mean_reward = mean_reward
            self.logger.record('runtime/best_mean_reward', self.best_mean_reward)

            self.episode_rewards.append(mean_reward)

        return True

    def _on_training_end(self) -> None:
        """Record and print the total training time and finished-episode count."""
        total_time = time.time() - self.start_time
        self.logger.record('runtime/total_time', total_time)
        self.logger.record('runtime/total_episodes', self.completed_episodes)
        # Nothing dumps the logger after training ends, so flush explicitly;
        # otherwise the two values recorded above would be lost.
        self.logger.dump(self.num_timesteps)
        print(f"Total training time: {total_time:.2f} seconds")
        print(f"Total number of episodes: {self.completed_episodes}")

In [6]:
def make_env(rank=None):
    """Build a factory that creates one monitored Rocket-Landing environment.

    Args:
        rank: Optional index of the environment inside a vectorised env. When
            given, it is appended to the Monitor log name so that each
            environment writes its own file instead of all of them sharing
            ``./monitor_train_logs``. ``None`` keeps the shared name.

    Returns:
        A zero-argument callable that creates and returns the wrapped env.
    """
    if rank is None:
        monitor_path = "./monitor_train_logs"
    else:
        monitor_path = f"./monitor_train_logs_{rank}"

    def _init():
        env = gym.make("PyFlyt/Rocket-Landing-v1")
        # Monitor for tracking metrics
        return Monitor(env, monitor_path)

    return _init

In [12]:
# Number of environments stepped together by the vectorised training env.
n_envs = 800
# DummyVecEnv steps every environment one after another in this process.
train_env = DummyVecEnv([make_env() for _ in range(n_envs)])

# Separate monitored environment used only for periodic evaluation.
eval_env = Monitor(gymnasium.make("PyFlyt/Rocket-Landing-v1"), "./monitor_eval_logs")

# `eval_freq` / `save_freq` count callback calls, and one call advances all
# `n_envs` environments at once. State the intent in total timesteps and
# convert, so the schedule stays the same if `n_envs` changes.
CALLBACK_EVERY_TIMESTEPS = 4_000_000  # equals 5000 calls with 800 envs
callback_freq = max(CALLBACK_EVERY_TIMESTEPS // n_envs, 1)

# Configure EvalCallback
eval_callback = EvalCallback(
    eval_env,  # Use the evaluation environment
    best_model_save_path='./best_models/',  # Save best models here
    log_path='./eval_logs/',  # Path for evaluation logs
    eval_freq=callback_freq,  # Evaluate every CALLBACK_EVERY_TIMESTEPS timesteps
    deterministic=True,  # Use deterministic policy for evaluation
    render=False,  # No rendering during evaluation
)

# Define a CheckpointCallback for model saving
checkpoint_callback = CheckpointCallback(
    save_freq=callback_freq,  # Save every CALLBACK_EVERY_TIMESTEPS timesteps
    save_path="./checkpoints/",
    name_prefix="ppo_model",
)

# Policy keyword arguments and PPO parameters
policy_kwargs = {
    "ortho_init": False,
    "activation_fn": nn.ReLU,
    # Separate two-layer networks for the policy (pi) and value function (vf)
    "net_arch": {
        "pi": [256, 256],
        "vf": [256, 256],
    },
}

ppo_params = {
    "tensorboard_log": "./",
    "policy_kwargs": policy_kwargs,
    "learning_rate": 1e-4,  # Learning rate
    "clip_range": 0.2,
    "batch_size": 128,
    "n_steps": 1024,  # Steps collected per environment before each update
    "gamma": 0.99,
    "gae_lambda": 0.95,
    "n_epochs": 5,
    "ent_coef": 1e-4,
    "max_grad_norm": 0.7,
}


In [13]:
# Build the PPO agent on the vectorised training env and train it for 10M
# timesteps with checkpointing, evaluation and reward-metric callbacks.
model = PPO(policy="MlpPolicy", env=train_env, verbose=1, **ppo_params)
model.learn(
    total_timesteps=10_000_000,
    callback=[
        checkpoint_callback,
        eval_callback,
        TrainingMetricsCallback(check_freq=1000),
    ],
)


Using cuda device
[A                             [A
[A                             [A
argv[0]=
argv[0]=
[A                             [A
[A                             [A
argv[0]=
argv[0]=
[A                             [A
[A                             [A
argv[0]=
argv[0]=
[A                             [A
[A                             [A
argv[0]=
argv[0]=
[A                             [A
argv[0]=
argv[0]=
[A                             [A
[A                             [A
argv[0]=
argv[0]=
[A                             [A
[A                             [A
argv[0]=
argv[0]=
[A                             [A
[A                             [A
argv[0]=
argv[0]=
[A                             [A
[A                             [A
argv[0]=
[A                             [A
[A                             [A
argv[0]=
argv[0]=
[A                             [A
[A                             [A
argv[0]=
argv[0]=
[A                             [A
a

KeyboardInterrupt: 

In [14]:
# Save the current model to disk (stored as "ppo_rocket_landing_v7.zip").
model.save("ppo_rocket_landing_v7")

In [14]:
import os  # needed here: the notebook only imports os in a later cell

# Find the most recently modified file in the checkpoint directory.
full_paths = [os.path.join("checkpoints", file) for file in os.listdir("checkpoints")]
sorted_files = sorted(full_paths, key=os.path.getmtime)
# Show the newest checkpoint path, or nothing when the directory is empty.
sorted_files[-1] if sorted_files else None

'checkpoints/ppo_model_8000000_steps.zip'

In [15]:
import os
import re

# Step counts of the saved checkpoints, parsed from file names such as
# "ppo_model_8000000_steps.zip". Entries that do not follow this pattern are
# skipped instead of crashing int().
_checkpoint_name = re.compile(r"ppo_model_(\d+)_steps\.zip")
saved_files = [
    int(match.group(1))
    for match in map(_checkpoint_name.fullmatch, os.listdir("checkpoints"))
    if match is not None
]

In [16]:
# Sort the checkpoint step counts in place, in ascending order.
saved_files.sort()

In [19]:
import shutil

# Copy the chosen checkpoint into the current working directory; the cell
# displays the destination path returned by shutil.copy.
file_name = "checkpoints/ppo_model_9840000_steps.zip"
shutil.copy(src=file_name, dst="./")


'./ppo_model_9840000_steps.zip'

In [None]:
# Evaluate the trained model on the dedicated evaluation environment.
# (`env` was never defined in this notebook; `eval_env` is the monitored
# environment created for evaluation.)
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

In [1]:
import os
# List the entries of the Kaggle working directory to check which files exist.
os.listdir("/kaggle/working/")

['ppo_rocket_landing_v7.zip',
 'monitor_eval_logs.monitor.csv',
 'PPO_1',
 '.virtual_documents',
 'monitor_train_logs.monitor.csv']