# 1. Monte Carlo Methods

## 1.1 Understanding the problem: Frozen Lake

The agent controls the movement of a character in a grid world. Some tiles of the grid are walkable, and others lead to the agent falling into the water. Additionally, the movement direction of the agent is uncertain and only partially depends on the chosen direction. The agent is rewarded for finding a walkable path to a goal tile.

![](frozen.png)

The surface is described using a grid like the following:

- SFFF       (S: starting point, safe)
- FHFH       (F: frozen surface, safe)
- FFFH       (H: hole, fall to your doom)
- HFFG       (G: goal, where the frisbee is located)


The episode ends when you reach the goal or fall in a hole. You receive a reward of 1 if you reach the goal, and zero otherwise.

## 1.2 Setting the environment

In [1]:
# src1 (not working)- https://www.analyticsvidhya.com/blog/2018/11/reinforcement-learning-introduction-monte-carlo-learning-openai-gym/
# src2 https://harderchoices.com/2018/04/04/monte-carlo-method-in-python/
import gym #For model of RL Problem
import numpy as np #for numpy array anf matrices
import operator #??
from IPython.display import clear_output #to clear output of previous execution
from time import sleep #for delay
from gym.spaces.tuple_space import Tuple #??
from gym.envs.registration import register# for using custom environment of Frozen Lake
import random #for random number generation
import itertools #??
#import tqdm
#tqdm.monitor_interval = 0

# Running this cell more than once registers the same ids again, which gym
# reports as an error.
register(
    max_episode_steps=200,
    kwargs=dict(map_name='4x4', is_slippery=False),
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    id='FrozenLakeNotSlippery-v0',
)

register(
    max_episode_steps=200,
    kwargs=dict(map_name='8x8', is_slippery=False),
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    id='FrozenLakeNotSlippery8x8-v0',
)

# Environment ids by map size. The slippery ids are not registered in this
# cell (presumably they ship with gym -- confirm for the installed version).
fl_slippery = dict(small='FrozenLake-v0', big='FrozenLake8x8-v0')

# Ids of the deterministic variants registered just above.
fl_not_slippery = dict(
    small='FrozenLakeNotSlippery-v0',
    big='FrozenLakeNotSlippery8x8-v0',
)


ModuleNotFoundError: No module named 'gym.spaces.tuple_space'

## Creating a new environment

In [None]:
def create_environment(slippery=False, big=False):
    """Create a Frozen Lake environment and reset it.

    Args:
        slippery: Pick the id from ``fl_slippery`` when true, otherwise from
            ``fl_not_slippery``.
        big: Use the ``'big'`` entry of the chosen table when true, otherwise
            the ``'small'`` one.

    Returns:
        The environment returned by ``gym.make``, after ``reset()`` was called.
    """
    env_ids = fl_slippery if slippery else fl_not_slippery
    size = 'big' if big else 'small'
    env = gym.make(env_ids[size])
    env.reset()
    return env

## 1.3 Creating random policy

In [None]:
def create_random_policy(env):
    """Build a uniform random policy for a discrete environment.

    Args:
        env: Object exposing ``observation_space.n`` (number of states) and
            ``action_space.n`` (number of actions).

    Returns:
        dict: ``{state: {action: probability}}`` in which every action of
        every state has probability ``1 / env.action_space.n``. Each state
        gets its own inner dict, so per-state updates stay independent.
    """
    n_actions = env.action_space.n
    return {
        state: {action: 1 / n_actions for action in range(n_actions)}
        for state in range(env.observation_space.n)
    }

## 1.4 Creating a dictionary for Q Table (state action value)

In [None]:
def create_state_action_dictionary(env, policy):
    """Create a Q table with every state-action value set to 0.0.

    Args:
        env: Object exposing ``action_space.n`` (number of actions).
        policy: Mapping whose keys are the states to include.

    Returns:
        dict: ``{state: {action: 0.0}}`` with actions ``0 .. n - 1`` for each
        key of ``policy``.
    """
    return {
        state: dict.fromkeys(range(env.action_space.n), 0.0)
        for state in policy
    }

## 1.5 Episodic Task

In [None]:
def run_game(env, policy, display=True):
    """Play one episode, sampling every action from ``policy``.

    Args:
        env: Environment providing ``reset()``, ``render()``, a ``step(action)``
            that returns ``(state, reward, done, info)``, and the current
            state as ``env.env.s``.
        policy: ``{state: {action: weight}}``. The weights of a state do not
            have to sum to 1; actions are drawn in proportion to their weight.
        display: When true, clear the notebook output and render the
            environment before each step and once more after the last step.

    Returns:
        list: One ``[state, action, reward]`` list per step, in play order.

    Raises:
        ValueError: From ``random.choices`` if all weights of the current
            state are zero.
    """
    env.reset()
    episode = []
    finished = False

    while not finished:
        s = env.env.s  # state the agent is in before it acts

        if display:
            clear_output(True)
            env.render()
            sleep(0.1)

        # random.choices normalises the weights itself and always yields an
        # action. The previous manual cumulative scan could finish without a
        # match (float rounding), leaving `action` unbound or stale.
        action_weights = policy[s]
        action = random.choices(
            list(action_weights), weights=list(action_weights.values())
        )[0]

        _, reward, finished, _ = env.step(action)
        episode.append([s, action, reward])

    if display:
        clear_output(True)
        env.render()
        sleep(0.05)

    return episode

## 1.6 Testing policy and displaying win percentage

In [None]:
def test_policy(policy, env):
    """Estimate how often ``policy`` wins over 100 episodes.

    An episode counts as a win when the reward of its last step equals 1.

    Args:
        policy: Policy passed on to ``run_game``.
        env: Environment passed on to ``run_game``.

    Returns:
        float: Fraction of the 100 episodes that were wins.
    """
    r = 100
    wins = sum(
        1
        for _ in range(r)
        if run_game(env, policy, display=False)[-1][-1] == 1
    )
    return wins / r

## Testing Random Policy

## 1.7 First-visit Monte Carlo control (ε-soft policy)

In [None]:
def monte_carlo_e_soft(env, episodes=100, policy=None, epsilon=0.01):
    """Improve a policy with on-policy first-visit Monte Carlo control.

    Each episode is generated with the current policy. Walking the episode
    backwards, the undiscounted return ``G`` is accumulated, and for the first
    visit of every (state, action) pair the return is averaged into ``Q``.
    The policy of that state is then made epsilon-soft around a greedy action.

    Args:
        env: Environment handed to ``run_game`` and the ``create_*`` helpers.
        episodes: Number of episodes to sample.
        policy: Optional starting policy ``{state: {action: probability}}``.
            It is updated in place. A uniform random policy is created when it
            is ``None`` or empty.
        epsilon: Probability mass spread evenly over all actions of a state;
            the greedy action receives the remaining ``1 - epsilon`` on top.

    Returns:
        dict: The updated policy.
    """
    if not policy:
        policy = create_random_policy(env)
    Q = create_state_action_dictionary(env, policy)
    returns = {}  # (state, action) -> list of first-visit returns

    for _ in range(episodes):
        episode = run_game(env=env, policy=policy, display=False)

        # Step index at which each (state, action) pair first occurs, so the
        # first-visit test below is a dict lookup instead of a list scan.
        first_visit = {}
        for step, (s, a, _reward) in enumerate(episode):
            first_visit.setdefault((s, a), step)

        G = 0
        # Walk backwards: rewards arrive at the end of the episode and have to
        # be propagated to the earlier steps.
        for step in reversed(range(len(episode))):
            s_t, a_t, r_t = episode[step]
            G += r_t

            if first_visit[(s_t, a_t)] != step:
                continue  # pair was already visited earlier in this episode

            visit_returns = returns.setdefault((s_t, a_t), [])
            visit_returns.append(G)
            Q[s_t][a_t] = sum(visit_returns) / len(visit_returns)

            # Greedy action, ties broken at random.
            best_value = max(Q[s_t].values())
            A_star = random.choice(
                [a for a, value in Q[s_t].items() if value == best_value]
            )

            # Epsilon-soft update: epsilon is divided by the NUMBER of actions.
            # Dividing by the sum of the current probabilities (as before)
            # gives a denominator that shifts while the entries are rewritten.
            n_actions = len(policy[s_t])
            for a in policy[s_t]:
                if a == A_star:
                    policy[s_t][a] = 1 - epsilon + epsilon / n_actions
                else:
                    policy[s_t][a] = epsilon / n_actions

    return policy

## 1.8 Executing and testing the built model

In [None]:
# Create the deterministic 4x4 environment, learn a policy on it and report
# the win rate measured by test_policy.
env = gym.make('FrozenLakeNotSlippery-v0')
# test_policy needs `policy`; with this line commented out the name was
# undefined at this point.
policy = monte_carlo_e_soft(env, episodes=1)
test_policy(policy, env)

## 1.9 Model with 50 Episodes

## 1.10 Model with 100 Episodes

## 1.11 What are your findings?

In [None]:
import gym
import numpy as np
import operator
from IPython.display import clear_output
from time import sleep
import random
import itertools
#import tqdm

#tqdm.monitor_interval = 0
def create_random_policy(env):
    """Return a policy that picks every action with equal probability.

    Args:
        env: Object exposing ``observation_space.n`` (number of states) and
            ``action_space.n`` (number of actions).

    Returns:
        dict: ``{state: {action: probability}}`` with probability
        ``1 / env.action_space.n`` for each action. Every state owns a
        separate inner dict.
    """
    n_actions = env.action_space.n
    policy = {}
    for state in range(env.observation_space.n):
        policy[state] = {action: 1 / n_actions for action in range(n_actions)}
    return policy
def create_state_action_dictionary(env, policy):
    """Build a Q table holding 0.0 for every state-action pair.

    Args:
        env: Object exposing ``action_space.n`` (number of actions).
        policy: Mapping whose keys are the states to include.

    Returns:
        dict: ``{state: {action: 0.0}}`` with actions ``0 .. n - 1`` for each
        key of ``policy``.
    """
    return {
        state: dict.fromkeys(range(env.action_space.n), 0.0)
        for state in policy
    }
def run_game(env, policy, display=True):
    """Play one episode, sampling every action from ``policy``.

    Args:
        env: Environment providing ``reset()``, ``render()``, a ``step(action)``
            that returns ``(state, reward, done, info)``, and the current
            state as ``env.env.s``.
        policy: ``{state: {action: weight}}``. The weights of a state do not
            have to sum to 1; actions are drawn in proportion to their weight.
        display: When true, clear the notebook output and render the
            environment before each step and once more after the last step.

    Returns:
        list: One ``[state, action, reward]`` list per step, in play order.

    Raises:
        ValueError: From ``random.choices`` if all weights of the current
            state are zero.
    """
    env.reset()
    episode = []
    finished = False

    while not finished:
        s = env.env.s  # state the agent is in before it acts

        if display:
            clear_output(True)
            env.render()
            sleep(1)

        # The step logic must run whether or not we render. It used to be
        # nested under `if display:`, so display=False never stepped the
        # environment and the loop never ended.
        action_weights = policy[s]
        # random.choices always yields an action; a manual cumulative scan can
        # finish without a match because of float rounding.
        action = random.choices(
            list(action_weights), weights=list(action_weights.values())
        )[0]

        _, reward, finished, _ = env.step(action)
        episode.append([s, action, reward])

    if display:
        clear_output(True)
        env.render()
        sleep(1)

    # Returned unconditionally (it used to be returned only when display was
    # true, so callers passing display=False received None).
    return episode
def test_policy(policy, env):
    """Measure the win rate of ``policy`` across 100 episodes.

    An episode counts as a win when the reward of its last step equals 1.

    Args:
        policy: Policy passed on to ``run_game``.
        env: Environment passed on to ``run_game``.

    Returns:
        float: Fraction of the 100 episodes that were wins.
    """
    r = 100
    outcomes = (run_game(env, policy, display=False)[-1][-1] for _ in range(r))
    wins = sum(1 for final_reward in outcomes if final_reward == 1)
    return wins / r
def monte_carlo_e_soft(env, episodes=100, policy=None, epsilon=0.01):
    """Improve a policy with on-policy first-visit Monte Carlo control.

    Each episode is generated with the current policy. Walking the episode
    backwards, the undiscounted return ``G`` is accumulated, and for the first
    visit of every (state, action) pair the return is averaged into ``Q``.
    The policy of that state is then made epsilon-soft around a greedy action.

    Args:
        env: Environment handed to ``run_game`` and the ``create_*`` helpers.
        episodes: Number of episodes to sample.
        policy: Optional starting policy ``{state: {action: probability}}``.
            It is updated in place. A uniform random policy is created when it
            is ``None`` or empty.
        epsilon: Probability mass spread evenly over all actions of a state;
            the greedy action receives the remaining ``1 - epsilon`` on top.

    Returns:
        dict: The updated policy.
    """
    if not policy:
        policy = create_random_policy(env)
    Q = create_state_action_dictionary(env, policy)
    returns = {}  # (state, action) -> list of first-visit returns

    for _ in range(episodes):
        episode = run_game(env=env, policy=policy, display=False)

        # Step index at which each (state, action) pair first occurs, so the
        # first-visit test below is a dict lookup instead of a list scan.
        first_visit = {}
        for step, (s, a, _reward) in enumerate(episode):
            first_visit.setdefault((s, a), step)

        G = 0
        # Walk backwards: rewards arrive at the end of the episode and have to
        # be propagated to the earlier steps. (Leftover debug prints removed.)
        for step in reversed(range(len(episode))):
            s_t, a_t, r_t = episode[step]
            G += r_t

            if first_visit[(s_t, a_t)] != step:
                continue  # pair was already visited earlier in this episode

            visit_returns = returns.setdefault((s_t, a_t), [])
            visit_returns.append(G)
            Q[s_t][a_t] = sum(visit_returns) / len(visit_returns)

            # Greedy action, ties broken at random.
            best_value = max(Q[s_t].values())
            A_star = random.choice(
                [a for a, value in Q[s_t].items() if value == best_value]
            )

            # Epsilon-soft update: epsilon is divided by the NUMBER of actions,
            # not by the sum of the probabilities being rewritten.
            n_actions = len(policy[s_t])
            for a in policy[s_t]:
                if a == A_star:
                    policy[s_t][a] = 1 - epsilon + epsilon / n_actions
                else:
                    policy[s_t][a] = epsilon / n_actions

    return policy
# Learn a policy from a single episode on 'FrozenLake-v0', then evaluate it.
env = gym.make('FrozenLake-v0')
policy = monte_carlo_e_soft(env=env, episodes=1)
test_policy(policy=policy, env=env)

In [None]:
import gym
import numpy as np
import operator
from IPython.display import clear_output
from time import sleep
from gym.spaces.tuple_space import Tuple
from gym.envs.registration import register
import random
import itertools
#import tqdm

#tqdm.monitor_interval = 0

# Register deterministic (is_slippery=False) Frozen Lake variants for both
# map sizes, each limited to 200 steps per episode.
register(
    max_episode_steps=200,
    kwargs=dict(map_name='4x4', is_slippery=False),
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    id='FrozenLakeNotSlippery-v0',
)

register(
    max_episode_steps=200,
    kwargs=dict(map_name='8x8', is_slippery=False),
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    id='FrozenLakeNotSlippery8x8-v0',
)

# Environment ids by map size. The slippery ids are not registered in this
# cell (presumably they ship with gym -- confirm for the installed version).
fl_slippery = dict(small='FrozenLake-v0', big='FrozenLake8x8-v0')

# Ids of the deterministic variants registered just above.
fl_not_slippery = dict(
    small='FrozenLakeNotSlippery-v0',
    big='FrozenLakeNotSlippery8x8-v0',
)
def create_environment(slippery=False, big=False):
    """Create a Frozen Lake environment and reset it.

    Args:
        slippery: Pick the id from ``fl_slippery`` when true, otherwise from
            ``fl_not_slippery``.
        big: Use the ``'big'`` entry of the chosen table when true, otherwise
            the ``'small'`` one.

    Returns:
        The environment returned by ``gym.make``, after ``reset()`` was called.
    """
    id_table = fl_slippery if slippery else fl_not_slippery
    env = gym.make(id_table['big' if big else 'small'])
    env.reset()
    return env

def create_random_policy(env):
    """Build a uniform random policy for a discrete environment.

    Args:
        env: Object exposing ``observation_space.n`` (number of states) and
            ``action_space.n`` (number of actions).

    Returns:
        dict: ``{state: {action: probability}}`` in which every action of
        every state has probability ``1 / env.action_space.n``. Each state
        gets its own inner dict, so per-state updates stay independent.
    """
    n_actions = env.action_space.n
    return {
        state: {action: 1 / n_actions for action in range(n_actions)}
        for state in range(env.observation_space.n)
    }


def create_state_action_dictionary(env, policy):
    """Create a Q table with every state-action value set to 0.0.

    Args:
        env: Object exposing ``action_space.n`` (number of actions).
        policy: Mapping whose keys are the states to include.

    Returns:
        dict: ``{state: {action: 0.0}}`` with actions ``0 .. n - 1`` for each
        key of ``policy``.
    """
    table = {}
    for state in policy:
        table[state] = dict.fromkeys(range(env.action_space.n), 0.0)
    return table

def run_game(env, policy, display=True):
    """Play one episode, sampling every action from ``policy``.

    Args:
        env: Environment providing ``reset()``, ``render()``, a ``step(action)``
            that returns ``(state, reward, done, info)``, and the current
            state as ``env.env.s``.
        policy: ``{state: {action: weight}}``. The weights of a state do not
            have to sum to 1; actions are drawn in proportion to their weight.
        display: When true, clear the notebook output and render the
            environment before each step and once more after the last step.

    Returns:
        list: One ``[state, action, reward]`` list per step, in play order.

    Raises:
        ValueError: From ``random.choices`` if all weights of the current
            state are zero.
    """
    env.reset()
    episode = []
    finished = False

    while not finished:
        s = env.env.s  # state the agent is in before it acts

        if display:
            clear_output(True)
            env.render()
            sleep(0.1)

        # random.choices normalises the weights itself and always yields an
        # action. The previous manual cumulative scan could finish without a
        # match (float rounding), leaving `action` unbound or stale.
        action_weights = policy[s]
        action = random.choices(
            list(action_weights), weights=list(action_weights.values())
        )[0]

        _, reward, finished, _ = env.step(action)
        episode.append([s, action, reward])

    if display:
        clear_output(True)
        env.render()
        sleep(0.05)

    return episode

def test_policy(policy, env):
    """Estimate how often ``policy`` wins over 100 episodes.

    An episode counts as a win when the reward of its last step equals 1.

    Args:
        policy: Policy passed on to ``run_game``.
        env: Environment passed on to ``run_game``.

    Returns:
        float: Fraction of the 100 episodes that were wins.
    """
    r = 100
    wins = sum(
        1
        for _ in range(r)
        if run_game(env, policy, display=False)[-1][-1] == 1
    )
    return wins / r
# Play and render one episode of the slippery 4x4 lake with a uniform random policy.
env = create_environment(big=False, slippery=True)
_ = run_game(env=env, policy=create_random_policy(env))
def monte_carlo_e_soft(env, episodes=100, policy=None, epsilon=0.01):
    """Improve a policy with on-policy first-visit Monte Carlo control.

    Each episode is generated with the current policy. Walking the episode
    backwards, the undiscounted return ``G`` is accumulated, and for the first
    visit of every (state, action) pair the return is averaged into ``Q``.
    The policy of that state is then made epsilon-soft around a greedy action.

    Args:
        env: Environment handed to ``run_game`` and the ``create_*`` helpers.
        episodes: Number of episodes to sample.
        policy: Optional starting policy ``{state: {action: probability}}``.
            It is updated in place. A uniform random policy is created when it
            is ``None`` or empty.
        epsilon: Probability mass spread evenly over all actions of a state;
            the greedy action receives the remaining ``1 - epsilon`` on top.

    Returns:
        dict: The updated policy.
    """
    if not policy:
        policy = create_random_policy(env)
    Q = create_state_action_dictionary(env, policy)
    returns = {}  # (state, action) -> list of first-visit returns

    for _ in range(episodes):
        episode = run_game(env=env, policy=policy, display=False)

        # Step index at which each (state, action) pair first occurs, so the
        # first-visit test below is a dict lookup instead of a list scan.
        first_visit = {}
        for step, (s, a, _reward) in enumerate(episode):
            first_visit.setdefault((s, a), step)

        G = 0
        # Walk backwards: rewards arrive at the end of the episode and have to
        # be propagated to the earlier steps.
        for step in reversed(range(len(episode))):
            s_t, a_t, r_t = episode[step]
            G += r_t

            if first_visit[(s_t, a_t)] != step:
                continue  # pair was already visited earlier in this episode

            visit_returns = returns.setdefault((s_t, a_t), [])
            visit_returns.append(G)
            Q[s_t][a_t] = sum(visit_returns) / len(visit_returns)

            # Greedy action, ties broken at random.
            best_value = max(Q[s_t].values())
            A_star = random.choice(
                [a for a, value in Q[s_t].items() if value == best_value]
            )

            # Epsilon-soft update: epsilon is divided by the NUMBER of actions.
            # Dividing by the sum of the current probabilities (as before)
            # gives a denominator that shifts while the entries are rewritten.
            n_actions = len(policy[s_t])
            for a in policy[s_t]:
                if a == A_star:
                    policy[s_t][a] = 1 - epsilon + epsilon / n_actions
                else:
                    policy[s_t][a] = epsilon / n_actions

    return policy
# Deterministic 4x4 lake: learn from 200 episodes, evaluate, then render one episode.
env = create_environment(big=False, slippery=False)
policy = monte_carlo_e_soft(env=env, episodes=200)
test_policy(policy=policy, env=env)
_ = run_game(env=env, policy=policy)

# Deterministic 8x8 lake: same procedure with 10000 training episodes.
env = create_environment(big=True, slippery=False)
policy = monte_carlo_e_soft(env=env, episodes=10000)
test_policy(policy=policy, env=env)
_ = run_game(env=env, policy=policy)

# Learning to navigate a car without Reinforcement Learning

- OpenAI Gym Library https://gym.openai.com
- Contains various models for researchers to practice Reinforcement Learning problems
- keras-rl framework

![](https://camo.githubusercontent.com/780b18443ca6ff68004fc01b29e59367b5a70300/68747470733a2f2f7170682e66732e71756f726163646e2e6e65742f6d61696e2d71696d672d65666133343639353532386435326463643036633535643564396234366265662d63)

In [None]:
import gym
from time import sleep

# Create the environment and keep the object behind its `.env` attribute
# (presumably the unwrapped environment -- confirm for the installed gym).
env = gym.make("Taxi-v2").env

# Start from a fixed state.
env.s = 328

# Step counter, penalty counter and last reward all start at zero.
epochs = 0
penalties = 0
reward = 0

# One record per step, replayed later as an animation.
frames = []

# `done` is set by env.step and ends the loop.
done = False

while not done:
    # Act at random: no learning is involved.
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    # A reward of -10 is counted as a penalty.
    penalties += 1 if reward == -10 else 0

    # Record the rendered frame together with the transition that produced it.
    frames.append(dict(
        frame=env.render(mode='ansi'),
        state=state,
        action=action,
        reward=reward,
    ))

    epochs += 1

# Replay the recorded frames, then print the summary of the random run.
def frames1(frames):
    """Print each recorded frame in turn, clearing the output in between.

    Args:
        frames: Sequence of dicts with the keys ``'frame'``, ``'state'``,
            ``'action'`` and ``'reward'``. ``frame['frame']`` must provide
            ``getvalue()`` (assumed to be the object returned by
            ``env.render(mode='ansi')`` -- confirm for the installed gym).
    """
    # Imported once per call instead of once per frame.
    from IPython.display import clear_output

    for i, frame in enumerate(frames):
        clear_output(wait=True)
        print(frame['frame'].getvalue())
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(.1)  # short pause so the frames read as an animation


frames1(frames)
print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))
env.close()


# A Quick Dive into Deep Reinforcement Learning - CartPole

In [None]:
#! source activate /anaconda3/envs/deeplearning/
#! pip install keras-rl

In [None]:
#Src https://www.analyticsvidhya.com/blog/2017/01/introduction-to-reinforcement-learning-implementation/


import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory
ENV_NAME = 'CartPole-v0'

# Create the environment, seed numpy and the environment, and read the number
# of available actions.
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Q-network: flatten the (window of 1) observation, one hidden layer of 16
# ReLU units, and a linear output with one unit per action.
model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
])
print(model.summary())

# DQN agent with an epsilon-greedy policy and a replay memory of 50000 entries.
policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=1e-2,
    policy=policy,
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Train for 5000 steps with rendering switched off (visualize=False), then
# run 5 test episodes with rendering on.
dqn.fit(env, nb_steps=5000, visualize=False, verbose=2)
dqn.test(env, nb_episodes=5, visualize=True)

