In [1]:
import gym
import pandas as pd
import numpy as np
from collections import defaultdict

# Solving the Frozen Lake Problem with Value Iteration

In [2]:
env = gym.make('FrozenLake-v1')
env

<TimeLimit<OrderEnforcing<PassiveEnvChecker<FrozenLakeEnv<FrozenLake-v1>>>>>

In [None]:
env.P[0][1]

### Computing optimal value function

In [3]:
def value_iteration(env):
    
    num_iterations = 1000
    threshold = 1e-20
    gamma = 1.0
    
    value_table = np.zeros(env.observation_space.n)
    
    for i in range(num_iterations):
        updated_value_table = np.copy(value_table)
        
        for s in range(env.observation_space.n):
            Q_values = [
                sum([prob * (r + gamma*updated_value_table[s_]) for prob, s_, r, _ in env.P[s][a]])
                for a in range(env.action_space.n)
                ]
            
            value_table[s] = max(Q_values)
            
        if (np.sum(np.fabs(updated_value_table - value_table)) <= threshold):
            break
    return value_table

In [16]:
[sum([a, b]) for a, b in zip(range(2), range(2)) for _ in range(3) ]

[0, 0, 0, 2, 2, 2]

### Extracting optimal policy from the optimal value function

In [4]:
def extract_policy(value_table):
    
    gamma = 1.0
    policy = np.zeros(env.observation_space.n)
    
    for s in range(env.observation_space.n):
        Q_values = [sum(
            [prob * (r + gamma*value_table[s_])
             for prob, s_, r, _ in env.P[s][a]])
                    for a in range(env.action_space.n)
                    ]
        policy[s] = np.argmax(np.array(Q_values))
    
    return policy 


In [5]:
optimal_value_function = value_iteration(env)
optimal_value_function

array([0.82352941, 0.82352941, 0.82352941, 0.82352941, 0.82352941,
       0.        , 0.52941176, 0.        , 0.82352941, 0.82352941,
       0.76470588, 0.        , 0.        , 0.88235294, 0.94117647,
       0.        ])

In [None]:
optimal_policy = extract_policy(optimal_value_function)
optimal_policy

### Computing value function using policy

In [None]:
def compute_value_function(policy):
    num_iterations = 1000
    threshold = 1e-20
    gamma = 1.0
    value_table = np.zeros(env.observation_space.n)
    
    for i in range(num_iterations):
        updated_value_table = np.copy(value_table)
        
        for s in range(env.observation_space.n):
            a = policy[s]
            value_table[s] = sum([prob * (r + gamma * updated_value_table[s_])
                                  for prob, s_, r, _ in env.P[s][a]
                                  ])
        
        if (np.sum((np.fabs(updated_value_table - value_table))) < threshold):
            break
    return value_table

In [None]:
def policy_iteration(env):
    
    num_iterations = 1000
    policy = np.zeros(env.observation_space.n)
    
    for i in range(num_iterations):
        
        value_function = compute_value_function(policy)
        new_policy = extract_policy(value_function)
        
        if (np.all(policy == new_policy)):
            break
        
        policy = new_policy
    
    return policy

In [None]:
optimal_policy = policy_iteration(env)
optimal_policy

# Monte Carlo

In [None]:
env = gym.make('Blackjack-v1')

In [22]:
def policy(state):
    return 0 if state[0] > 15 else 1

In [None]:
state = env.reset()
state

In [None]:
player = state[0][0]
dealer = state[0][1]
usable = state[0][2]

print(f"player : {player}   dealer : {dealer}   usable : {usable}")

In [None]:
env.step(1)

In [None]:
def generate_episode(policy):
    
    episode = []
    state = env.reset()[0]
    num_timestep = 100
    
    for i in range(num_timestep):        
        action = policy(state)
        next_state, reward, done, info, _ = env.step(action)
        
        episode.append(
            (state, action, reward)
        )
        if done:
            break
        
        state = next_state
    
    return episode


episode = generate_episode(policy)
episode

In [None]:
a, b, c = zip(*episode)
a

## Value Function

In [None]:
total_return = defaultdict(float)
N = defaultdict(int)

num_iterations = 100

for i in range(num_iterations):
    
    episode = generate_episode(policy)
    states, actions, rewards = zip(*episode)
    
    for t, state in enumerate(states):
        R = (sum(rewards[t:]))
        total_return[state] = total_return[state] + R
        N[state] = N[state] + 1


In [None]:
total_return

In [None]:
len(total_return)

In [None]:
total_return = pd.DataFrame(total_return.items(),columns=['state', 'total_return'])
total_return

In [None]:
N = pd.DataFrame(N.items(),columns=['state', 'N'])
N

In [None]:
df = pd.merge(total_return, N, on="state")
df

In [None]:
df['value'] = df['total_return']/df['N']
df

In [None]:
total_return = defaultdict(float)
N = defaultdict(int)
num_iterations = 10000

for i in range(num_iterations):
    
    episode = generate_episode(policy)
    states, actions, rewards = zip(*episode)
    
    for t, state in enumerate(states):
        
        if state not in states[0:t]:    
            R = (sum(rewards[t:]))
            total_return[state] = total_return[state] + R
            
            N[state] = N[state] + 1
        

In [None]:
total_return = pd.DataFrame(total_return.items(),columns=['state', 'total_return'])
N = pd.DataFrame(N.items(),columns=['state', 'N'])

df = pd.merge(total_return, N, on="state")
df['value'] = df['total_return']/df['N']
df

# Implementing On-policy MC control

In [None]:
import gym
import pandas as pd
from collections import defaultdict
import random

In [None]:
Q = defaultdict(float)
total_return = defaultdict(float)
N = defaultdict(int)


In [None]:
def epsilon_greedy_policy(state,Q):
    
    epsilon = 0.5
    
    if random.uniform(0, 1) < epsilon:
        return env.action_space.sample()
    else:
        return max(
            list(range(env.action_space.n)), key = lambda x : Q[(state, x)]
        )


## Generating an episode

In [None]:
num_timesteps = 100

def generate_episode(Q):
    
    episode = env.reset()
    
    for t in range(num_timesteps):
        
        action = epsilon_greedy_policy(state, Q)
        next_state, reward, done, info = env.step(action)
        episode.append(
            (state, action, reward)
        )
        
        if done:
            break
    return episode

In [4]:
a = env.action_space.sample()
a

0

In [5]:
state, reward, done, info, _ = env.step(a)

  if not isinstance(terminated, (bool, np.bool8)):


In [None]:
total_return

# Frozenlake Value iteration

In [None]:
import gym
import collections



ENV_NAME = "FrozenLake-v1"
#ENV_NAME = "FrozenLake8x8-v0"      # uncomment for larger version
GAMMA = 0.9
TEST_EPISODES = 20

In [None]:
class Agent:
    
    def __init__(self):
        self.env = gym.make(ENV_NAME)
        self.state = self.env.reset()
        self.rewards = collections.defaultdict(float)
        self.transits = collections.defaultdict(
            collections.Counter
        )
        self.values = collections.defaultdict(float)
    
    
    def play_n_random_steps(self, count):
        for _ in range(count):
            action = self.env.action_space.sample()
            new_state, reward, is_done, _ = self.env.step(action)
            self.rewards[(self.state, action, new_state)] = reward
            self.transits[(self.state, action, new_state)] += 1
            self.state = self.env.reset() if is_done else new_state
    
    
    

In [None]:
test_env = gym.make(ENV_NAME)
agent = Agent()

agent.play_n_random_steps(30)

In [None]:
test_env.reset()

In [None]:
a = test_env.action_space.sample()
test_env.step(a)

# DQN

In [17]:
import random
import gym
import numpy as np
from collections import deque
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam

In [None]:
env = gym.make("MsPacman-v1")

In [None]:
state_size = (88, 80, 1)

In [None]:
action_size = env.action_space.n

# Policy Gradient

In [2]:
import tensorflow as tf
print(tf.__version__)

2.10.0


In [3]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import gym

In [4]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [7]:
env = gym.make('CartPole-v1')

gamma = 0.95

state_shape = env.observation_space.shape[0]
num_actions = env.action_space.n

In [5]:
def discount_and_normalize_rewards(episode_rewards):
    discounted_rewards = np.zeros_like(episode_rewards)
    reward_to_go = 0.0
    
    for i in reversed(range(len(episode_rewards))):
        reward_to_go = reward_to_go * gamma + episode_rewards[i]
        discounted_rewards[i] = reward_to_go
        
    discounted_rewards -= np.mean(discounted_rewards)
    discounted_rewards /= np.std(discounted_rewards)
    
    return discounted_rewards



# Building the policy network

In [8]:
state_ph = tf.placeholder(tf.float32, [None, state_shape], name="state_ph")

action_ph = tf.placeholder(tf.int32, [None, num_actions],  name="action_ph")

discounted_rewards_ph = tf.placeholder(tf.float32, [None,], name="discounted_rewards")

In [9]:
layer1 = tf.layers.dense(state_ph, units = 32, activation=tf.nn.relu)

layer2 = tf.layers.dense(layer1, units=num_actions)

prob_dist = tf.nn.softmax(layer2)

In [10]:
neg_log_policy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=layer2, labels=action_ph)

loss = tf.reduce_mean(neg_log_policy * discounted_rewards_ph)

train = tf.train.AdamOptimizer(0.01).minimize(loss)

## Training the network

In [12]:
num_iterations = 1000

# start the TensorFlow session
with tf.Session() as sess:
    
    # initialize all the TensorFlow variables
    sess.run(tf.global_variables_initializer())
    
    # for every iteration
    for i in range(num_iterations):
        # initialize an empty list for storing the states, actions, and rewards obtained in the episode
        episode_states, episode_actions, episode_rewards = [], [], []
        
        # set the done to False
        done = False
        
        # initialize the state by resetting the environment
        state = env.reset()[0]
        
        # initialize the return
        Return=0
        
        while not done:
            
            # reshape the state
            state = state.reshape([1, 4])
            
            pi = sess.run(prob_dist, feed_dict = {state_ph : state})
            
            # select an action using this stochastic policy
            a = np.random.choice(range(pi.shape[1]), p=pi.ravel())
            
            # perform the selected action
            next_state, reward, done, info, _ = env.step(a)
            
            env.render()
            
            # update the return
            Return += reward
            
            # one-hot encode the action
            action = np.zeros(num_actions)
            action[a] = 1
            
            # store the state, action, and reward into their respective list
            episode_states.append(state)
            episode_actions.append(action)
            episode_rewards.append(reward)
            
            # update the state to the next state
            state = next_state
        
        # Compute the discounted and normalized reward
        discounted_rewards = discount_and_normalize_rewards(episode_rewards)
        
        # define the feed dictionary
        feed_dict = {state_ph : np.vstack(np.array(episode_states)),
                     action_ph : np.vstack(np.array(episode_actions)),
                     discounted_rewards_ph : discounted_rewards
                     }
        
        # train the network
        loss, _ = sess.run([loss, train], feed_dict = feed_dict)
        
        # print the return for every 10 iteration
        if i%10 == 0:
            print("Iteration : {}, Return : {}".format(i, Return))
        

Iteration : 0, Return : 10.0


TypeError: Argument `fetch` = 0.2018943727016449 has invalid type "float32" must be a string or Tensor. (Can not convert a float32 into a Tensor or Operation.)

In [None]:
n=