In [None]:
################################################################################################
# RUNME ONCE
# Run this cell once in order to install the dependencies needed for the lab. 
# After running this cell, remember to restart the kernel and proceed executing the next cells.
################################################################################################

! pip3 install gymnasium

## Q* Learning 
In this notebook, we'll implement an agent that plays FrozenLake.
The goal of this game is <b>to go from the starting state (S) to the goal state (G)</b> by walking only on frozen tiles (F) and avoiding holes (H). However, the ice is slippery, <b>so you won't always move in the direction you intend (stochastic environment)</b>.

We use 3 libraries:
- `Numpy` for our Qtable
- `Gymnasium` for our FrozenLake Environment. Read the documentation [here](https://gymnasium.farama.org/).
- `Random` to generate random numbers

In [None]:
import numpy as np
import gymnasium as gym
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
import random
from typing import NamedTuple


### Step 1: Create the environment
- Here we'll create the FrozenLake environment. 
- Gymnasium (the maintained successor of OpenAI Gym) is a library <b>composed of many environments that we can use to train our agents.</b>
- In our case we choose to use Frozen Lake.
- Note that S is the start, F is a frozen tile, H is a hole, and G is the goal.

In [None]:
class Params(NamedTuple):
    """Hyperparameters and environment settings shared by every cell of the lab.

    The tuple is immutable: use ``params._replace(...)`` to derive an updated copy.
    """

    total_episodes: int  # Total episodes
    learning_rate: float  # Learning rate
    max_steps: int  # Maximum number of steps taken within one episode
    gamma: float  # Discounting rate
    epsilon: float  # Exploration probability
    map_size: int  # Number of tiles of one side of the squared environment
    seed: int  # Define a seed so that we get reproducible results
    # If true the player will move in intended direction with probability of 1/3 else will move
    # in either perpendicular direction with equal probability of 1/3 in both directions
    is_slippery: bool
    n_runs: int  # Number of runs
    action_size: int  # Number of possible actions (None until read from the environment)
    state_size: int  # Number of possible states (None until read from the environment)
    proba_frozen: float  # Probability that a tile is frozen
    decay_rate: float  # Exponential decay rate applied to the exploration probability
    # Bounds of the epsilon decay schedule used by the training loop:
    #   epsilon = min_epsilon + (max_epsilon - min_epsilon) * exp(-decay_rate * episode)
    # They are declared last, with defaults, so existing constructions of Params keep working.
    min_epsilon: float = 0.01  # Lower bound reached by the exploration probability
    max_epsilon: float = 1.0  # Value the decay schedule starts from (episode 0)


params = Params(
    total_episodes=2000,
    learning_rate=0.8,
    max_steps=99,
    gamma=0.95,
    epsilon=0.1,
    map_size=4,
    seed=123,
    is_slippery=False,
    n_runs=20,
    action_size=None,
    state_size=None,
    proba_frozen=0.9,
    decay_rate=0.005,
    min_epsilon=0.01,
    max_epsilon=1.0,
)
print(params)

# Set the seed
rng = np.random.default_rng(params.seed)

In [None]:
# Build the FrozenLake environment on a randomly generated map; the map layout is
# driven by the same seed as the rest of the lab so that it is reproducible.
env = gym.make(
    "FrozenLake-v1",
    desc=generate_random_map(
        size=params.map_size,
        p=params.proba_frozen,
        seed=params.seed,
    ),
    is_slippery=params.is_slippery,
    render_mode="rgb_array",
)


### Step 2: Create the Q-table and initialize it
- Now we'll create our Q-table. To know how many rows (states) and columns (actions) we need, we have to compute the action_size and the state_size.
- Gymnasium provides us a way to do that: `env.action_space.n` and `env.observation_space.n`

In [None]:
# Read the table dimensions from the environment and store them in a new copy of
# the (immutable) parameter tuple.
params = params._replace(
    action_size=env.action_space.n,
    state_size=env.observation_space.n,
)

print(f"Action size: {params.action_size}")
print(f"State size: {params.state_size}")

In [None]:
# Complete the code to create the qtable from the action_size and state_size.
# The training loop indexes it as qtable[state, action]: one row per state and
# one column per action.
qtable = ####################
print(qtable)

### Step 3: The Q learning algorithm
- Now we implement the Q learning algorithm:
<img src="qtable_algo.png" alt="Q algo"/>

In [None]:
# Step 1 --> Q-values are already initialized.

# List of rewards
rewards = []

# Step 2 --> For life or until learning is stopped ...
for episode in range(params.total_episodes):
    # Reset the environment
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    
    for step in range(params.max_steps):
        # Step 3 --> Choose an action (a) in the current world state (s)
        ## First we randomize a number
        exp_exp_tradeoff = ####################
        
        ## If this number > greater than epsilon --> exploitation (taking the biggest Q value for this state)
        if exp_exp_tradeoff > epsilon:
            action = # you can use the function argmax from numpy

        # Else doing a random choice --> exploration
        else:
            action = # do something

        # Step 4 --> Take the action (a) and observe the outcome state(s') and reward (r)
        new_state, reward, done, info = # use the method step from env. Check gym library documentation.

        # Step 5 --> Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
        # qtable[new_state,:] : all the actions we can take from new state
        qtable[state, action] = # update following Bellman's equation
        
        total_rewards += reward
        
        # Our new state is state
        state = new_state
        
        # If done (if we're dead) : finish episode
        if done == True: 
            break
        
    # Reduce epsilon 
    epsilon = params.min_epsilon + (params.max_epsilon - params.min_epsilon)*np.exp(-params.decay_rate*episode) 
    rewards.append(total_rewards)
    # Why do we need to reduce the epsilon? Comment below:
    '''
    Write your answer here:
    

    '''

print ("Score over time: " +  str(sum(rewards)/params.total_episodes))
print(qtable)

Find the trend in how the reward evolves at each step. Your result should follow the same trend as the following figure:

<img src="output.png" alt="output"/>

You can check also the following [link](https://ai.stackexchange.com/questions/34071/is-the-optimal-policy-the-one-with-the-highest-accumulative-reward-q-learning-v) for further information.

In [None]:
# Write here your code to evaluate the evolution of the reward in each step.
# You can copy the entire code for training or modify in the previous cell.




### Question:

Change the hyperparameters and comment on the impact of `map_size` and of the hyperparameters related to epsilon.