<a href="https://colab.research.google.com/github/adarsh-nl/Markov-Decision-Process/blob/main/MC_and_TD_control.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
import gym
import numpy as np

class Agent(gym.Env):
    def __init__(self, n_states, n_actions):
        self.observation_space = gym.spaces.Discrete(n_states)
        self.states = [n for n in range(self.observation_space.n)]
        self.initial_state = self.states[0]
        self.final_state = self.states[-1]
        self.action_space = gym.spaces.Discrete(n_actions)
        self.actions = [n for n in range(self.action_space.n)]
        self.rewards = np.random.rand(n_states, n_actions)
        self.value = np.zeros((n_states, 1))
        #self.policy = np.random.rand(n_states, n_actions)
        #self.policy /= self.policy.sum(axis=1, keepdims=True)
        self.policy = np.random.randint(0, n_states, (n_states, 1))
        self.policy = self.policy.flatten()        
        self.state = 0
        self.iterations = 1000

    def reset(self, state):
        self.state = state
        return self.state

    def value_reset(self):
        self.value = np.zeros((n_states, 1))
    
    def mc_episodes(self, state):
        rewards_ = []
        steps = 0
        while state != self.final_state:
            action = int(np.random.randint(self.action_space.n))
            print
            rewards_.append(self.rewards[state, action])
            next_state = self.policy[state]
            state = next_state
            steps += 1
            if steps > 100:
                break
        gt = sum(rewards_)
        return gt

    def monte_carlo(self):
        for state in self.states:
          for episode in range (1, self.iterations):
              gt = self.mc_episodes(state)
              #self.value[state]  = self.value[state] + (1/episode)*(gt - self.value[state])
              self.value[state]  = (self.value[state] * (episode - 1) + gt )/episode

    # TD (Temporal Difference) Method
    def TD(self):
         # Loop over the number of episodes
        for episode in range(1, self.iterations):
            # Loop over each state
            for state in self.states:
                # Update the value function for each state
                self.value[state] += self.rewards[state, np.random.randint(len(self.states))] + np.random.rand(1)
        
        # Normalize the value function by the number of iterations
        self.value = self.value/self.iterations

    def policy_improvement(self, policy, transition_probs, rewards, discount_factor, V):
        for state in range(self.numstates_):
            action_values = np.zeros(self.actions)
            for action, action_prob in enumerate(policy[state]):
                for next_state, prob in enumerate(transition_probs[state][action]):
                    action_values[action] += action_prob * prob * (rewards[state][action][next_state] + discount_factor * V[next_state])
            policy[state] = np.eye(self.actions)[np.argmax(action_values)]
        return policy

    def policy_iteration(self, n_states, n_actions, rewards, alg='MC', epsilon=0.1):
        steps = 0
        k = 1
        #policy = np.ones([n_states, n_actions]) / n_actions
        policy = self.policy
        #V = np.zeros(n_states)
        V = self.value
        while True:
            if alg == 'MC':
                V = self.value
            elif alg == 'TD':
                self.TD()
                V = self.value
            else:
                raise ValueError(f"Invalid value for 'alg' parameter: {alg}")
            policy_stable = True
            for state in range(n_states):
                steps += 1
                coin_toss = np.random.rand(1)
                if coin_toss < epsilon:
                    # Exploration: choose a random action
                    new_a = np.random.choice(n_actions)
                else:
                    # Exploitation: choose the action with highest Q-value
                    action_values = np.zeros(n_actions)
                    for action in range(n_actions):
                        next_state = action
                        av = rewards[state][action] + V[next_state]
                        action_values[action] = av
                    new_a = np.argmax(action_values)
                chosen_a = np.argmax(policy[state])
                if chosen_a != new_a:
                    policy_stable = False
                    #policy[state] = np.eye(n_actions)[new_a]
                if steps>100:
                  break
            if policy_stable:
                break
        return policy, V

n_states = 5
n_actions = 5

env = Agent(n_states, n_actions)
print(f"observation space: {env.observation_space.n}, states created from observation space: {env.states}")
print(f"action space: {env.action_space.n}, states created from observation space: {env.actions}")
print(f"value of each states:\n {env.value}")
env.monte_carlo()
print(f"value of each states after MC prediction:\n {env.value}")
policy_matrix, value_matrix = env.policy_iteration(n_states, n_actions, env.rewards, alg='MC')
print("The policy after iteration is: {} \n".format(policy_matrix))

env.value_reset()
print('\n-------x-------\n')
print(f"value of each states are reset before implementing TD prediction:\n {env.value}")
env.TD()
print(f"value of each states after TD prediction:\n {env.value}")
policy_matrix, value_matrix = env.policy_iteration(n_states, n_actions, env.rewards, alg='TD')
print("The policy after iteration is: {}\n".format(policy_matrix))

observation space: 5, states created from observation space: [0, 1, 2, 3, 4]
action space: 5, states created from observation space: [0, 1, 2, 3, 4]
value of each states:
 [[0.]
 [0.]
 [0.]
 [0.]
 [0.]]
value of each states after MC prediction:
 [[48.03557604]
 [48.31435447]
 [48.04824438]
 [ 0.42348357]
 [ 0.        ]]
The policy after iteration is: [2 1 1 4 3] 


-------x-------

value of each states are reset before implementing TD prediction:
 [[0.]
 [0.]
 [0.]
 [0.]
 [0.]]
value of each states after TD prediction:
 [[0.84546669]
 [0.96583004]
 [0.87577251]
 [0.92424104]
 [0.79536902]]
The policy after iteration is: [2 1 1 4 3]



In [10]:
np.random.randint(5)

3

In [None]:
env = Agent(3, 3)
env.rewards[1, np.random.randint(len(env.states)-1)]

In [4]:
import numpy as np
np.random.rand(1)

array([0.93872286])