<a href="https://colab.research.google.com/github/adarsh-nl/Markov-Decision-Process/blob/main/MC_and_TD_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym
import numpy as np

class Agent(gym.Env):
    """
    Toy tabular environment used to demonstrate value prediction.

    The environment has a discrete set of states and actions, a random
    reward table, and a fixed random ``policy`` array. Note that ``policy``
    is used directly as a deterministic transition map: ``policy[s]`` is the
    state reached from ``s``. The last state is treated as terminal.
    """

    def __init__(self, n_states, n_actions):
        """
        Initialize the environment with the number of states and actions.

        Parameters:
        n_states (int): The number of states in the environment.
        n_actions (int): The number of actions available in the environment.
        """
        super().__init__()

        # Observation space and the explicit list of state indices.
        self.observation_space = gym.spaces.Discrete(n_states)
        self.states = list(range(self.observation_space.n))
        self.initial_state = self.states[0]
        self.final_state = self.states[-1]  # terminal state for episodes

        # Action space and the explicit list of action indices.
        self.action_space = gym.spaces.Discrete(n_actions)
        self.actions = list(range(self.action_space.n))

        # Reward for each (state, action) pair, drawn uniformly from [0, 1).
        self.rewards = np.random.rand(n_states, n_actions)

        # Value estimate for each state, stored as a column vector.
        self.value = np.zeros((n_states, 1))

        # One entry per state, drawn from [0, n_states). mc_episodes uses
        # policy[s] as the *next state*, not as an action index.
        self.policy = np.random.randint(0, n_states, size=n_states)

        # Current state of the environment.
        self.state = 0

        # Number of episodes / sweeps used by the prediction methods.
        self.iterations = 1000

    def reset(self, state):
        """
        Reset the environment to a specific state.

        Parameters:
        state (int): The state to reset the environment to.

        Returns:
        int: The reset state.
        """
        self.state = state
        return self.state

    def value_reset(self):
        """
        Reset the value function for each state to zero.
        """
        # Size the array from instance state rather than a module-level
        # global, so the method works for any Agent instance.
        self.value = np.zeros((len(self.states), 1))

    def mc_episodes(self, state, max_steps=100):
        """
        Roll out one episode and return its undiscounted return.

        At every step a reward is read from the reward table for the current
        state and a uniformly random action, then the state advances to
        ``policy[state]``. The rollout stops at the final state, or once more
        than ``max_steps`` steps have been taken (the transition map may
        contain cycles that never reach the final state).

        Parameters:
        state (int): The starting state of the episode.
        max_steps (int): Step cap that guards against endless cycles.

        Returns:
        float: The total reward obtained in the episode (0.0 when the
        episode starts in the final state).
        """
        n_actions = len(self.actions)
        total = 0.0
        steps = 0
        while state != self.final_state:
            # Sample the action over the action count, not the state count.
            action = np.random.randint(n_actions)
            total += self.rewards[state, action]
            state = self.policy[state]
            steps += 1
            if steps > max_steps:
                break
        return total

    def monte_carlo(self):
        """
        Estimate the value of each state with Monte Carlo prediction.

        For every state, ``self.iterations`` episodes are rolled out from
        that state and ``self.value[state]`` is set to the running mean of
        their returns.
        """
        for state in self.states:
            for episode in range(1, self.iterations + 1):
                gt = self.mc_episodes(state)
                # Incremental mean: V <- V + (G - V) / k
                self.value[state] += (gt - self.value[state]) / episode

    # TD (Temporal Difference) Method
    def TD(self):
        """
        Placeholder for TD prediction.

        NOTE: despite its name, this method does not form a temporal-
        difference target. Each of the ``self.iterations`` sweeps adds an
        independent sample from ``np.random.rand`` to every state's value,
        and the accumulated sums are then divided by ``self.iterations``.
        """
        n_states = len(self.states)
        for _ in range(self.iterations):
            # One fresh random increment per state in each sweep.
            self.value += np.random.rand(n_states, 1)

        # Average over exactly the number of sweeps that were accumulated.
        self.value = self.value / self.iterations

# Problem size. Change these two constants (or read them with input())
# to experiment with a different number of states and actions.
n_states = 10
n_actions = 10

# Create an instance of the Agent class
env = Agent(n_states, n_actions)

# Print the observation space and the corresponding states
print(f"observation space: {env.observation_space.n}, states created from observation space: {env.states}")

# Print the action space and the corresponding actions
print(f"action space: {env.action_space.n}, actions created from action space: {env.actions}")

# Print the initial value function
print(f"value of each states:\n {env.value}")

# Run the Monte Carlo prediction method
env.monte_carlo()

# Print the value function after running Monte Carlo prediction
print(f"value of each states after MC prediction:\n {env.value}")

# Reset the value function so the second method starts from zeros
env.value_reset()

# Print the reset value function
print('\n-------x-------\n')
print(f"value of each states are reset before implementing TD prediction:\n {env.value}")

# Run the TD prediction method
env.TD()

# Print the value function after running TD prediction
print(f"value of each states after TD prediction:\n {env.value}")

observation space: 10, states created from observation space: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
action space: 10, states created from observation space: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
value of each states:
 [[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]]
value of each states after MC prediction:
 [[38.78949316]
 [38.37813275]
 [38.44553487]
 [38.65218908]
 [38.50179054]
 [38.52757707]
 [51.43691397]
 [38.3143613 ]
 [38.29877352]
 [ 0.        ]]

-------x-------

value of each states are reset before implementing TD prediction:
 [[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]]
value of each states after TD prediction:
 [[0.48627172]
 [0.50203133]
 [0.50123029]
 [0.49453106]
 [0.48870542]
 [0.48771604]
 [0.50165964]
 [0.49508976]
 [0.49729741]
 [0.4998937 ]]
