# ML2 (CSCI 4052U) Exam: RL

Installs:
- pip install gymnasium
- pip install swig
- pip install "gymnasium[box2d]"

In [49]:
import gymnasium as gym
import numpy as np
import time
import math

Lunar Lander Page: https://gymnasium.farama.org/environments/box2d/lunar_lander/

64 possible states, 4 actions (left, down, right, up)

In [120]:



# Runs the requested amount of episodes using Q-Learning
# episodes = # of episodes to run through
# learning_rate
# discount_factor = 0 to 1, closer to 1 values future rewards highly, closer to 0 focuses on immediate rewards more
# epsilon = 0 to 1, chance of taking a random action (1 is always random, 0 is always optimal action as per q values)
# epsilon_change = how much to lower epsilon by per episode (over time max q-value should take priority and exploration minimized)
# slippery = If False, the action requested is always followed through on. If True, the action requested is followed through on 1/3 of the time,
#            and the two perpendicular actions are taken 1/3 of the time each (ex. request=left (1/3 chance), 1/3 chance of slipping up, 1/3 of down)
# render = None for no visualization, "Human" to see visualization

def run_episodes(episodes, learning_rate=0.9, discount_factor=0.9, epsilon=1, epsilon_change=0.01, slippery=True, render=None, debug=False):
    env = gym.make("FrozenLake-v1", map_name="8x8", is_slippery=slippery, render_mode=render)

    # 64 states (0 to 63) and 4 actions (0 = left, 1 = down, 2 = right, 3 = up)
    q = np.zeros((env.observation_space.n, env.action_space.n)) # q-value storage
    rng = np.random.default_rng() # random number from 0 to 1 (to determine if random action should be taken)
    completions = np.full(episodes,False)
    checkpoints = math.floor(episodes/10) # Print statement at 10% completion intervals

    for _ in range(episodes):
        state, info = env.reset()

        if debug:
            print("Episode", _, " , Epsilon:", epsilon)
        elif (_+1)%checkpoints==0:
            print("Episode", _, " , Epsilon:", round(epsilon,3), " | Completions so Far:", completions.sum(), " | Success Rate so Far:", round(completions.sum()/_,3)*100,"%")
            

        while True:

            if rng.random() < epsilon:
                action = env.action_space.sample() # Random action
            else:
                action = np.argmax(q[state,:])

            # new_state: After taking the action calculated above, what position are we now in? (0-63)
            # reward: The reward for taking that action (reach goal = +1, reach hole/frozen = 0)
            # terminated: True if the player moves into a hole OR the player reaches the goal
            # truncation: True if the limit (length of episode) is reached, this is 200 for 8x8 env
            # info: number from 0 to 1 with odds of taking the action requested (1/3 if is_slippery, 1 otherwise)
            new_state, reward, terminated, truncated, info = env.step(action)

            if reward == 1:
                completions[_] = True

            q[state,action] = q[state,action] + learning_rate * (reward + discount_factor * max(q[new_state,:]) -q[state,action])

            state = new_state

            if terminated or truncated:
                if render == "Human":
                    time.sleep(0.05)
                break

        epsilon -= epsilon_change # Lower Epsilon by specified amount
        if epsilon < 0:
            epsilon = 0

    time.sleep(0.5)
    env.close()
    print("\nSimple Breakdown:")
    print("Episodes:", episodes)
    print("Successful Episodes:", completions.sum())
    print("Failed Episodes:", (episodes-completions.sum()))
    print("Success Rate:", round(((completions.sum())/(episodes))*100,3), "%")

    

In [123]:
# render="Human" to visualize
run_episodes(70000, epsilon_change=0.00002, slippery=False)

Episode 6999  , Epsilon: 0.86  | Completions so Far: 38  | Success Rate so Far: 0.5 %
Episode 13999  , Epsilon: 0.72  | Completions so Far: 125  | Success Rate so Far: 0.8999999999999999 %
Episode 20999  , Epsilon: 0.58  | Completions so Far: 398  | Success Rate so Far: 1.9 %
Episode 27999  , Epsilon: 0.44  | Completions so Far: 1200  | Success Rate so Far: 4.3 %
Episode 34999  , Epsilon: 0.3  | Completions so Far: 3016  | Success Rate so Far: 8.6 %
Episode 41999  , Epsilon: 0.16  | Completions so Far: 6557  | Success Rate so Far: 15.6 %
Episode 48999  , Epsilon: 0.02  | Completions so Far: 12173  | Success Rate so Far: 24.8 %
Episode 55999  , Epsilon: 0  | Completions so Far: 19155  | Success Rate so Far: 34.2 %
Episode 62999  , Epsilon: 0  | Completions so Far: 26155  | Success Rate so Far: 41.5 %
Episode 69999  , Epsilon: 0  | Completions so Far: 33155  | Success Rate so Far: 47.4 %

Simple Breakdown:
Episodes: 70000
Successful Episodes: 33156
Failed Episodes: 36844
Success Rate: 47

Research Resources:
- AI (CSCI 4610U) Lectures
- ML2 (CSCI 4052U) Lectures
- https://www.youtube.com/watch?v=ZhoIgo3qqLU - FrozenLake Gymnasium
- https://gymnasium.farama.org/environments/toy_text/frozen_lake/


In [89]:
env.close() # Run to  manually close environment if error occurs

NameError: name 'env' is not defined