In [None]:
import numpy as np
import gym
import random
from collections import defaultdict 

In [None]:
class Environment:
    """Wrapper around a gym environment with discrete state and action spaces.

    Attributes:
        env: the underlying environment returned by ``gym.make``.
        states_number: ``n`` of the environment's observation space.
        actions_number: ``n`` of the environment's action space.
    """

    def __init__(self, environment_name):
        """Instantiate the gym environment registered under ``environment_name``."""
        self.env = gym.make(environment_name)
        self.states_number = self.env.observation_space.n
        self.actions_number = self.env.action_space.n

    def generate_episode(self, policy, max_steps):
        """Roll out one episode, choosing actions with ``policy`` in training mode.

        Args:
            policy: object exposing ``act(state, training=True)``.
            max_steps: upper bound on the number of environment steps taken.

        Returns:
            A list of ``(state, action, reward)`` tuples, one per step taken,
            where ``state`` is the state the action was chosen in.
        """
        trajectory = []
        current_state = self.env.reset()

        for _ in range(max_steps):
            chosen_action = policy.act(current_state, training = True)
            next_state, reward, done, _info = self.env.step(chosen_action)
            trajectory.append((current_state, chosen_action, reward))
            # Stop as soon as the environment reports the episode is over.
            if done == True:
                break
            current_state = next_state

        return trajectory

In [None]:
class Policy:
    """Tabular policy stored as a (states x actions) matrix ``Pi`` of action weights."""

    def __init__(self, states_number, actions_number):
        """Start with every action equally weighted in every state."""
        uniform_weight = 1 / actions_number
        self.Pi = np.full((states_number, actions_number), uniform_weight)

    def act(self, state, training = False):
        """Pick an action for ``state``.

        With ``training`` equal to True the action is sampled using the row
        ``Pi[state]`` as weights; otherwise the index of the largest entry in
        that row is returned.
        """
        if training == True:
            weights = self.Pi[state, :]
            candidates = list(range(len(self.Pi[state])))
            return random.choices(candidates, weights)[0]
        return np.argmax(self.Pi[state, :])

    def update(self, state, action, value):
        """Overwrite the weight of ``action`` in ``state`` with ``value``."""
        row = self.Pi[state]
        row[action] = value

In [None]:
class MonteCarloOnPolicy:
     """First-visit, on-policy Monte Carlo control with an epsilon-soft policy.

     Attributes:
         policy: policy object exposing ``act(state, training=...)`` and
             ``update(state, action, value)``.
         env: environment wrapper exposing ``states_number``,
             ``actions_number`` and ``generate_episode(policy, max_steps)``.
         Qs: (states x actions) array of action-value estimates, initially zero.
     """

     def __init__(self, env, policy):
         """Store the environment and policy and zero-initialise the Q table."""
         self.policy = policy
         self.env = env
         self.Qs = np.zeros((env.states_number, env.actions_number))

     def train(self, max_episodes = 100000, max_steps = 100, discount_rate = 0.98, epsilon = 0.3):
         """Estimate ``Qs`` from sampled episodes and improve the policy in place.

         For every episode the discounted return is accumulated backwards; the
         return observed at the first occurrence of each (state, action) pair
         is appended to that pair's history and ``Qs`` is set to the history's
         mean. The policy row for the state is then rewritten so the greedy
         action gets ``1 - epsilon + epsilon / n_actions`` and every other
         action gets ``epsilon / n_actions``. The final Q table is printed.

         Args:
             max_episodes: number of episodes to sample.
             max_steps: step limit handed to ``generate_episode``.
             discount_rate: factor applied to the running return at each step.
             epsilon: exploration mass spread over all actions.
         """
         returns = defaultdict(list)
         n_actions = self.env.actions_number
         # Both probabilities are loop-invariant, so compute them once.
         greedy_prob = 1 - epsilon + (epsilon / n_actions)
         explore_prob = epsilon / n_actions

         for _ in range(max_episodes):
             # Use the environment this agent was built with, not a global.
             episode = self.env.generate_episode(self.policy, max_steps)

             # Index of the first occurrence of each (state, action) pair, so
             # the first-visit test below is a dictionary lookup instead of a
             # scan over the episode prefix.
             first_visit = {}
             for t, timestep in enumerate(episode):
                 first_visit.setdefault((timestep[0], timestep[1]), t)

             G = 0
             for t in reversed(range(len(episode))):
                 state, action, reward = episode[t][:3]
                 G = discount_rate * G + reward

                 if first_visit[(state, action)] != t:
                     continue

                 history = returns[(state, action)]
                 history.append(G)
                 self.Qs[state][action] = np.mean(np.array(history))
                 best_action = np.argmax(self.Qs[state, :])

                 for candidate in range(n_actions):
                     if candidate == best_action:
                         self.policy.update(state, candidate, greedy_prob)
                     else:
                         self.policy.update(state, candidate, explore_prob)

         print(self.Qs)

     def act(self, state):
         """Return the policy's action for ``state`` with training mode off."""
         return self.policy.act(state)

   

In [None]:
# Wrap FrozenLake, create the tabular policy sized to its state/action counts,
# and train the Monte Carlo agent with its default hyper-parameters.
env = Environment("FrozenLake-v0")

policy = Policy(states_number=env.states_number, actions_number=env.actions_number)
agent = MonteCarloOnPolicy(env=env, policy=policy)
agent.train()

[[0.10454445 0.09559594 0.09889683 0.09149859]
 [0.05748951 0.06436491 0.05930372 0.09048444]
 [0.10032586 0.0910475  0.09902385 0.08661042]
 [0.05527071 0.05460277 0.04963884 0.0804791 ]
 [0.12312643 0.09117876 0.08645882 0.0714447 ]
 [0.         0.         0.         0.        ]
 [0.13644591 0.10109804 0.12867966 0.02932373]
 [0.         0.         0.         0.        ]
 [0.08976574 0.14039456 0.12540382 0.1786621 ]
 [0.18812248 0.28682332 0.23531673 0.15607385]
 [0.33883578 0.30443626 0.24591092 0.12899032]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.21812244 0.34698722 0.43188239 0.30607072]
 [0.45671521 0.68134188 0.65373953 0.56391634]
 [0.         0.         0.         0.        ]]


In [None]:
# Evaluate the trained agent on a separate FrozenLake instance and report the
# fraction of episodes that end with a reward of 1.
n_test_episodes = 1000
max_test_steps = 100

wins = 0
test_env = gym.make("FrozenLake-v0")

for episode in range(n_test_episodes):
  state = test_env.reset()
  done = False

  for step in range(max_test_steps):
     action = agent.act(state)
     new_state, reward, done, info = test_env.step(action)

     if done != True:
        state = new_state
        continue

     # Episode finished: count it as a win only when the final reward is 1.
     if reward == 1:
        wins += 1
     break

print("wins ratio ", wins / n_test_episodes)


wins ratio  0.713


Comparison