# <center>Table of Contents</center>

### 1. **Import Libraries**  
   - 1A. [Import Required Libraries](#1a-import-required-libraries)  
   - 1B. [Create Environment and Test](#1b-create-environment-and-test)  

### 2. **Train Model for Normal Version with PPO**  
   - 2A. [Preprocess the Environment](#2a-preprocess-environment)  
   - 2B. [Train the Model](#2b-train-model)  
   - 2C. [Save the Model](#2c-save-model)  
   - 2D. [Evaluate the Model](#2d-evaluate-model)  

### 3. **Train Model for Hardcore Version with PPO**  
   - 3A. [Test the Environment](#3a-test-environment)  
   - 3B. [Preprocess the Environment](#3b-preprocess-environment)  
   - 3C. [Train the Hardcore Model](#3c-train-model)  
   - 3D. [Save the Hardcore Model](#3d-save-model)  
   - 3E. [Evaluate the Hardcore Model](#3e-evaluate-model)  


# <center>1. Import Libraries</center>

## 1A) Import Libraries

In [None]:
# Import the necessary libraries

# gymnasium is a modern version of the gym library, used to create and interact with reinforcement learning environments
import gymnasium as gym

# Import PPO (Proximal Policy Optimization) from stable-baselines3, which is a popular reinforcement learning algorithm
from stable_baselines3 import PPO

# Import the evaluation function to assess the performance of the trained policy
from stable_baselines3.common.evaluation import evaluate_policy

# Import Monitor to log training information such as rewards and episode lengths
from stable_baselines3.common.monitor import Monitor

# Import utility functions for vectorized environments, normalization, frame stacking, and video recording
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize, VecFrameStack, VecVideoRecorder

# Import os for handling directory creation and file paths
import os 

# Import pandas for handling and analyzing data (e.g., log files)
import pandas as pd

## 1B) Create Env and Test

In [None]:
# Build the standard BipedalWalker task with on-screen ("human") rendering
env = gym.make(
    "BipedalWalker-v3",
    render_mode="human",
)

In [None]:
# Sanity-check the environment by driving it with random actions.
try:
    # Gymnasium's reset() returns an (observation, info) pair; unpack both so
    # that `obs` holds a plain observation, just like the value step() returns.
    obs, info = env.reset()

    # Let the agent take random actions for 1000 steps
    for _ in range(1000):
        # Take a random action sampled from the environment's action space
        action = env.action_space.sample()

        # Step the environment forward using the chosen action.
        # step() returns the new observation, the reward, whether the episode
        # ended (done), whether it was cut short (truncated), and extra info.
        obs, reward, done, truncated, info = env.step(action)

        # If the episode is finished (either done or truncated), start a new one
        if done or truncated:
            obs, info = env.reset()
finally:
    # Always release the environment's resources, even if a step fails
    env.close()

# <center>2) Train Model for Normal Version with PPO</center>

## 2A) Preprocess Environment

In [None]:
# Create the training environment.
# render_mode="rgb_array" is needed because the VecVideoRecorder wrapper that is
# applied to this environment further down records frames rendered as arrays.
env = gym.make("BipedalWalker-v3", render_mode="rgb_array")

In [None]:
# Folder that receives the Monitor logs; it is created on first use
logs_dir = 'logs'
os.makedirs(logs_dir, exist_ok=True)

# Optional log-file prefix (edit by hand if needed).
# Left empty, the log is written as 'monitor.csv'; with a text value it is
# written as '(text).monitor.csv'.
log_filename = ""

# Full path handed to the Monitor wrapper
monitor_log_path = os.path.join(logs_dir, log_filename)

# Monitor records episode statistics to the path above
env = Monitor(env, monitor_log_path)

In [None]:
# Build the observation pipeline in one expression, innermost wrapper first:
#   1. DummyVecEnv    - exposes the single environment through the vectorized API
#   2. VecNormalize   - normalizes observations and rewards, clipping
#                       observations at 10 to limit the effect of outliers
#   3. VecFrameStack  - stacks the last 4 observations to give the agent
#                       temporal information
env = VecFrameStack(
    VecNormalize(
        DummyVecEnv([lambda: env]),
        norm_obs=True,
        norm_reward=True,
        clip_obs=10.0,
    ),
    n_stack=4,
)

In [None]:
# Recorded clips are written to this folder; create it if it is missing
video_folder = 'videos'
os.makedirs(video_folder, exist_ok=True)

# Start a new 200-step recording whenever the step counter is a multiple of 1000
env = VecVideoRecorder(
    env,
    video_folder,
    record_video_trigger=lambda step: step % 1000 == 0,
    video_length=200,
)

## 2B) Train Model

In [None]:
# Instantiate PPO with an MLP policy on the wrapped environment (verbose logging on)
model = PPO(
    policy="MlpPolicy",
    env=env,
    verbose=1,
)

In [None]:
# Train for one million environment steps
model.learn(total_timesteps=1_000_000)

## 2C) Save Model

In [None]:
# Save the trained policy
model.save("ppo_bipedalwalker_1M")

# The policy was trained on observations scaled by VecNormalize, so the running
# statistics of that wrapper are stored as well; without them the same scaling
# cannot be reproduced when the saved policy is used later.
vec_normalize = model.get_vec_normalize_env()
if vec_normalize is not None:
    vec_normalize.save("ppo_bipedalwalker_1M_vecnormalize.pkl")

In [None]:
# Remove the in-memory model; the evaluation section reloads it from the saved file
del model

## 2D) Evaluate Model

In [None]:
# Load the checkpoint written in the "Save Model" step.
# The name must match the one passed to model.save() ("ppo_bipedalwalker_1M").
model = PPO.load("ppo_bipedalwalker_1M")

In [None]:
# Rebuild the evaluation environment with the same observation pipeline that the
# policy was trained on (DummyVecEnv -> VecNormalize -> VecFrameStack with 4 frames).
# A bare gym environment produces unstacked, unnormalized observations that do not
# match the input the trained policy expects.
# Monitor is added so that evaluate_policy reports the true episode rewards.
env = DummyVecEnv([lambda: Monitor(gym.make("BipedalWalker-v3", render_mode="human"))])

# Re-apply the normalization statistics from training when a statistics file is present
vecnormalize_path = "ppo_bipedalwalker_1M_vecnormalize.pkl"
if os.path.exists(vecnormalize_path):
    env = VecNormalize.load(vecnormalize_path, env)
    env.training = False      # freeze the running statistics during evaluation
    env.norm_reward = False   # report raw (unnormalized) rewards

# Stack the last 4 observations, exactly as during training
env = VecFrameStack(env, n_stack=4)

In [None]:
# Run 10 evaluation episodes and collect the mean and standard deviation of the reward
mean_reward, std_reward = evaluate_policy(
    model,
    env,
    n_eval_episodes=10,
)

# Report the result
print(f"Average reward: {mean_reward} ± {std_reward}")

# <center>3) Train Model for Hardcore Version with PPO</center>

## 3A) Test Environment

In [None]:
# Build the hardcore variant of BipedalWalker with on-screen ("human") rendering
env = gym.make(
    "BipedalWalker-v3",
    hardcore=True,
    render_mode="human",
)

In [None]:
# Sanity-check the hardcore environment by driving it with random actions.
try:
    # Gymnasium's reset() returns an (observation, info) pair; unpack both so
    # that `obs` holds a plain observation, just like the value step() returns.
    obs, info = env.reset()

    # Let the agent take random actions for 1000 steps
    for _ in range(1000):
        # Take a random action sampled from the environment's action space
        action = env.action_space.sample()

        # Step the environment forward using the chosen action.
        # step() returns the new observation, the reward, whether the episode
        # ended (done), whether it was cut short (truncated), and extra info.
        obs, reward, done, truncated, info = env.step(action)

        # If the episode is finished (either done or truncated), start a new one
        if done or truncated:
            obs, info = env.reset()
finally:
    # Always release the environment's resources, even if a step fails
    env.close()

## 3B) Preprocess Environment

In [None]:
# Create the hardcore training environment.
# render_mode="rgb_array" is needed because the VecVideoRecorder wrapper that is
# applied to this environment further down records frames rendered as arrays.
env = gym.make("BipedalWalker-v3", hardcore=True, render_mode="rgb_array")

In [None]:
# Folder that receives the Monitor logs; it is created on first use
logs_dir = 'logs'
os.makedirs(logs_dir, exist_ok=True)

# Optional log-file prefix (edit by hand if needed).
# Left empty, the log is written as 'monitor.csv'; with a text value it is
# written as '(text).monitor.csv'.
log_filename = ""

# Full path handed to the Monitor wrapper
monitor_log_path = os.path.join(logs_dir, log_filename)

# Monitor records episode statistics to the path above
env = Monitor(env, monitor_log_path)

In [None]:
# Build the observation pipeline in one expression, innermost wrapper first:
#   1. DummyVecEnv    - exposes the single environment through the vectorized API
#   2. VecNormalize   - normalizes observations and rewards, clipping
#                       observations at 10 to limit the effect of outliers
#   3. VecFrameStack  - stacks the last 4 observations to give the agent
#                       temporal information
env = VecFrameStack(
    VecNormalize(
        DummyVecEnv([lambda: env]),
        norm_obs=True,
        norm_reward=True,
        clip_obs=10.0,
    ),
    n_stack=4,
)

In [None]:
# Recorded clips are written to this folder; create it if it is missing
video_folder = 'videos'
os.makedirs(video_folder, exist_ok=True)

# Wrap the environment so that videos are recorded:
# a new 200-step recording starts whenever the step counter is a multiple of 1000
env = VecVideoRecorder(
    env,
    video_folder,
    record_video_trigger=lambda step: step % 1000 == 0,
    video_length=200,
)

## 3C) Train Model

In [None]:
# Instantiate PPO with an MLP policy on the wrapped hardcore environment (verbose logging on)
model = PPO(
    policy="MlpPolicy",
    env=env,
    verbose=1,
)

In [None]:
# Train for two million environment steps
model.learn(total_timesteps=2_000_000)

## 3D) Save Model

In [None]:
# Save the trained hardcore policy.
# NOTE(review): the file name says "3M", but the training cell requests
# 2,000,000 timesteps; the name is kept because the load cell uses it.
model.save("ppo_bipedalwalker_hardcore_3M")

# The policy was trained on observations scaled by VecNormalize, so the running
# statistics of that wrapper are stored as well; without them the same scaling
# cannot be reproduced when the saved policy is used later.
vec_normalize = model.get_vec_normalize_env()
if vec_normalize is not None:
    vec_normalize.save("ppo_bipedalwalker_hardcore_3M_vecnormalize.pkl")

In [None]:
# Remove the in-memory model; the evaluation section reloads it from the saved file
del model

## 3E) Evaluate Model

In [None]:
# Restore the hardcore policy from the checkpoint saved above
model = PPO.load(path="ppo_bipedalwalker_hardcore_3M")

In [None]:
# Rebuild the evaluation environment with the same observation pipeline that the
# policy was trained on (DummyVecEnv -> VecNormalize -> VecFrameStack with 4 frames).
# A bare gym environment produces unstacked, unnormalized observations that do not
# match the input the trained policy expects.
# Monitor is added so that evaluate_policy reports the true episode rewards.
env = DummyVecEnv(
    [lambda: Monitor(gym.make("BipedalWalker-v3", hardcore=True, render_mode="human"))]
)

# Re-apply the normalization statistics from training when a statistics file is present
vecnormalize_path = "ppo_bipedalwalker_hardcore_3M_vecnormalize.pkl"
if os.path.exists(vecnormalize_path):
    env = VecNormalize.load(vecnormalize_path, env)
    env.training = False      # freeze the running statistics during evaluation
    env.norm_reward = False   # report raw (unnormalized) rewards

# Stack the last 4 observations, exactly as during training
env = VecFrameStack(env, n_stack=4)

In [None]:
# Run 10 evaluation episodes and collect the mean and standard deviation of the reward
mean_reward, std_reward = evaluate_policy(
    model,
    env,
    n_eval_episodes=10,
)

# Report the result
print(f"Average reward: {mean_reward} ± {std_reward}")