In [None]:
import numpy as np
import gym
import random
from collections import defaultdict 

In [None]:
class Environment:
    """Wrapper around a gym environment with discrete observation and action spaces.

    Attributes:
        env: the underlying environment returned by ``gym.make``.
        states_number: size of the observation space (``observation_space.n``).
        actions_number: size of the action space (``action_space.n``).
    """

    def __init__(self, environment_name):
        """Create the gym environment registered under ``environment_name``."""
        self.env = gym.make(environment_name)
        self.states_number = self.env.observation_space.n
        self.actions_number = self.env.action_space.n

    def generate_episode(self, policy, max_steps):
        """Roll out one episode by following ``policy``.

        Args:
            policy: object exposing ``act(state)`` that returns an action.
            max_steps: upper bound on the number of environment steps taken.

        Returns:
            A list of ``(state, action, reward)`` tuples, one per step taken,
            where ``state`` is the state the action was chosen in.
        """
        reset_result = self.env.reset()
        # Newer gym versions return ``(observation, info)`` from reset();
        # older ones return the bare observation. Observations here are
        # discrete indices, so a (obs, dict) pair can only be the new form.
        if (isinstance(reset_result, tuple) and len(reset_result) == 2
                and isinstance(reset_result[1], dict)):
            state = reset_result[0]
        else:
            state = reset_result

        experience = []
        for _ in range(max_steps):
            action = policy.act(state)
            step_result = self.env.step(action)
            if len(step_result) == 5:
                # Newer API: (obs, reward, terminated, truncated, info).
                new_state, reward, terminated, truncated, _info = step_result
                done = terminated or truncated
            else:
                # Legacy API: (obs, reward, done, info).
                new_state, reward, done, _info = step_result

            experience.append((state, action, reward))
            if done:
                break
            state = new_state

        return experience

In [None]:
class BehaviourPolicy:
    """Stochastic policy stored as a table of per-state action probabilities.

    ``Pi[s, a]`` holds the probability of picking action ``a`` in state ``s``;
    the table is initialised so every action in a state is equally likely.
    """

    def __init__(self, states_number, actions_number):
        uniform_probability = 1 / actions_number
        table_shape = (states_number, actions_number)
        self.Pi = np.full(table_shape, uniform_probability)

    def act(self, state):
        """Sample one action index for ``state`` according to ``Pi[state]``."""
        weights = self.Pi[state, :]
        candidates = list(range(len(weights)))
        # random.choices returns a one-element list when k is left at its default.
        (chosen,) = random.choices(candidates, weights)
        return chosen

    def prob(self, state, action):
        """Return the probability this policy assigns to ``action`` in ``state``."""
        return self.Pi[state][action]


In [None]:
class TargetPolicy:
    """Deterministic greedy policy over an action-value table.

    The table ``Qs`` is stored by reference, not copied, so later in-place
    updates made by whoever owns the table are visible to this policy.
    """

    def __init__(self, states_number, actions_number, Qs):
        """Store the action-value table.

        ``states_number`` and ``actions_number`` are accepted for signature
        parity with ``BehaviourPolicy`` but are not used.
        """
        self.Qs = Qs

    def act(self, state):
        """Return the action with the highest value in ``state``."""
        return np.argmax(self.Qs[state, :])

    def prob(self, state, action):
        """Return 1 if ``action`` is the greedy action in ``state``, else 0."""
        # Keep the caller's ``action`` intact: the greedy action must be held
        # in its own variable for the comparison to mean anything.
        max_action = np.argmax(self.Qs[state, :])
        return 1 if action == max_action else 0

In [None]:
class MonteCarloOffPolicy:
    """Off-policy Monte Carlo control using cumulative importance weights.

    Episodes are generated with a behaviour policy, and the action-value
    table ``Qs`` is updated in place towards the values of the greedy policy.

    Attributes:
        env: object exposing ``states_number``, ``actions_number`` and
            ``generate_episode(policy, max_steps)``.
        b_policy: behaviour policy exposing ``prob(state, action)`` (and
            whatever ``env.generate_episode`` needs from it).
        Qs: ``(states_number, actions_number)`` array of action values,
            initialised to zeros and mutated in place by ``train``.
    """

    def __init__(self, env, b_policy):
        self.b_policy = b_policy
        self.env = env
        self.Qs = np.zeros((env.states_number, env.actions_number))

    def train(self, max_episodes = 1600000, max_steps = 100, discount_rate = 0.98):
        """Run the learning loop and print the resulting action-value table.

        Args:
            max_episodes: number of episodes to sample from the behaviour policy.
            max_steps: step cap passed to ``env.generate_episode``.
            discount_rate: discount factor applied to future rewards.
        """
        # Running sum of importance weights seen for every (state, action).
        cumulativeW = np.zeros_like(self.Qs)

        for _ in range(max_episodes):
            # Use the instance's own env/policy, not same-named globals.
            episode = self.env.generate_episode(self.b_policy, max_steps)
            G = 0.
            W = 1.

            # Walk the episode backwards. Every stored tuple is a real
            # (state, action, reward) transition, so the last one is
            # updated too.
            for state, action, reward in reversed(episode):
                G = discount_rate * G + reward
                cumulativeW[state][action] += W
                step_size = W / cumulativeW[state][action]
                self.Qs[state][action] += step_size * (G - self.Qs[state][action])

                # The greedy target policy gives zero probability to any other
                # action, so earlier steps would carry zero weight: stop here.
                if action != np.argmax(self.Qs[state, :]):
                    break
                # Target probability is 1 for the greedy action.
                W *= 1. / self.b_policy.prob(state, action)

        print(self.Qs)

    def act(self, state):
        """Return the greedy action for ``state`` under the learned ``Qs``."""
        return np.argmax(self.Qs[state, :])


In [None]:
# Build the environment wrapper for the task being learned.
env = Environment(environment_name="FrozenLake-v0")

# Behaviour policy used to generate the training episodes.
b_policy = BehaviourPolicy(
    states_number=env.states_number,
    actions_number=env.actions_number,
)

# The learner owns the action-value table; the target policy receives that
# same table object (agent.Qs) rather than a copy.
agent = MonteCarloOffPolicy(env=env, b_policy=b_policy)
t_policy = TargetPolicy(
    states_number=env.states_number,
    actions_number=env.actions_number,
    Qs=agent.Qs,
)

# Train with the default hyper-parameters.
agent.train()

In [None]:
# Evaluate the target policy on a fresh environment instance.
test_env = gym.make("FrozenLake-v0")
wins = 0
done = False

print(t_policy.Qs)

for episode in range(1000):
    state = test_env.reset()
    done = False

    for step in range(100):
        action = t_policy.act(state)
        new_state, reward, done, info = test_env.step(action)

        if not done:
            # Episode still running: move on to the next state.
            state = new_state
            continue

        # Episode finished: count it as a win only when the final reward is 1.
        wins += 1 if reward == 1 else 0
        break

print("wins ratio ", wins/1000)
