<a href="https://colab.research.google.com/github/adarsh-nl/Markov-Decision-Process/blob/main/off_policy_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

class Agents:
  """Tabular agent over a small randomly generated decision process.

  Each instance owns a random reward table of shape ``(n_states, n_actions)``,
  a zero-initialised value table of shape ``(n_states, 1)`` and a
  deterministic policy stored as a 1-D integer array.  ``policy[s]`` is used
  directly as the state reached after ``s``.  The last state is treated as
  the terminal state.
  """

  def __init__(self, n_states, n_actions):
    """Create the agent with random rewards and a random policy.

    Args:
      n_states: number of states; states are the integers ``0..n_states-1``.
      n_actions: number of actions; this is the width of the reward table.
    """
    self.n_states = n_states
    self.n_actions = n_actions
    self.states = list(range(n_states))
    self.actions = list(range(n_actions))
    self.rewards = np.random.rand(n_states, n_actions)
    self.value = np.zeros((n_states, 1))
    # One successor state per state, flattened to shape (n_states,).
    self.policy = np.random.randint(0, n_states, (n_states, 1)).flatten()
    self.state = 0
    self.iterations = 1000
    self.final_state = self.states[-1]

  def mc_episodes(self, state):
    """Roll out one episode from ``state`` and return its undiscounted return.

    At every step a reward is read from the current state's row of the reward
    table at a uniformly random action column, then the policy is followed to
    the next state.  The rollout stops at the terminal state, or once more
    than 100 steps have been taken (the policy may contain cycles that never
    reach the terminal state).

    Args:
      state: index of the starting state.

    Returns:
      The sum of the collected rewards; ``0`` when ``state`` is already the
      terminal state.
    """
    gt = 0
    steps = 0
    while state != self.final_state:
      # The column index must range over actions, not states; otherwise the
      # lookup goes out of bounds whenever n_states > n_actions.
      action = np.random.randint(self.n_actions)
      gt += self.rewards[state, action]
      state = self.policy[state]
      steps += 1
      if steps > 100:
        break
    return gt

  def monte_carlo(self):
    """Estimate the value of every state by averaging sampled returns.

    Runs ``self.iterations`` episodes from each state and keeps an
    incremental mean of the returns in ``self.value``.  Afterwards the last
    entry of the table is overwritten with the mean of the whole table.

    Returns:
      ``self.value`` (the same array object, updated in place).
    """
    for state in self.states:
      # Episodes are numbered from 1 so the running-mean update is valid; the
      # upper bound is inclusive so exactly ``iterations`` episodes are run.
      for episode in range(1, self.iterations + 1):
        gt = self.mc_episodes(state)
        self.value[state] = (self.value[state] * (episode - 1) + gt) / episode
    self.value[-1] = np.mean(self.value)
    return self.value

  def off_policy_learning(self, policy1, policy2, value1, value2):
    """Copy policy and value entries from agent 1 to agent 2 along a walk.

    Two walks start from ``self.state``: one follows ``policy1``, the other
    follows ``policy2``.  At each step the entry of ``policy2`` for the second
    walk's current state is replaced by the matching entry of ``policy1``, and
    the entry of ``value2`` at the first walk's next state is replaced by the
    matching entry of ``value1``.  The loop ends when both walks are at the
    terminal state or after ``self.iterations`` steps.

    ``policy2`` and ``value2`` are modified in place.

    Args:
      policy1: policy array that is copied from.
      policy2: policy array that is copied into.
      value1: value table that is copied from.
      value2: value table that is copied into.

    Returns:
      The tuple ``(policy1, policy2, value1, value2)``.
    """
    state1 = self.state
    state2 = self.state

    for _ in range(self.iterations):
      if state1 == self.final_state and state2 == self.final_state:
        break

      # Both next states are read before policy2 is overwritten, so the
      # second walk takes this step using its old entry.
      action1 = policy1[state1]
      action2 = policy2[state2]
      policy2[state2] = policy1[state2]
      value2[action1] = value1[action1]

      state1 = action1
      state2 = action2
    return policy1, policy2, value1, value2

In [3]:
# Two agents over the same state/action space; each draws its own random
# rewards and policy.  Agent1 holds the target policy, Agent2 the behavioural.
n_states = 5
n_actions = 5
Agent1 = Agents(n_states, n_actions)
Agent2 = Agents(n_states, n_actions)

# Snapshot before any learning has happened.
print(f"The Target policy: \n{Agent1.policy}\n\nThe initial action value function: \n{Agent1.value}")
print(f"\n\nThe behavioural policy: \n{Agent2.policy}\n\nThe initial action value function: \n{Agent2.value}")

# Estimate Agent1's values by Monte Carlo, then copy policy and value entries
# from Agent1 to Agent2 along the visited states.
Agent1.value = Agent1.monte_carlo()
Agent1.policy, Agent2.policy, Agent1.value, Agent2.value = Agent2.off_policy_learning(
    Agent1.policy, Agent2.policy, Agent1.value, Agent2.value
)

# Snapshot after learning; these values are no longer the initial ones.
print(f"\n\n------------------------------------------------------\n\nThe Target policy: \n{Agent1.policy}\n\nThe learned action value function: \n{Agent1.value}")
print(f"\n\nThe behavioural policy: \n{Agent2.policy}\n\nThe learned action value function: \n{Agent2.value}")


The Target policy: 
[0 4 1 3 1]

The intitial action value function: 
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


The behavioural policy: 
[3 4 2 4 1]

The intitial action value function: 
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


------------------------------------------------------

The Target policy: 
[0 4 1 3 1]

The intitial action value function: 
[[50.1936935 ]
 [ 0.36480052]
 [ 0.96774284]
 [43.41319886]
 [18.98788715]]


The behavioural policy: 
[0 4 2 3 1]

The intitial action value function: 
[[50.1936935]
 [ 0.       ]
 [ 0.       ]
 [ 0.       ]
 [ 0.       ]]
