# 1. Model based 

## Create environment

In [None]:
import numpy as np

class GridWorld:
    """Deterministic square grid world with a single rewarding goal cell.

    The agent begins in the top-left cell ``(0, 0)`` and the goal is the
    bottom-right cell ``(size - 1, size - 1)``.

    Attributes:
        size: Side length of the grid.
        grid: ``size x size`` array of zeros (not read by ``step``).
        start: Current agent position as ``(row, col)``. It is ``(0, 0)``
            after construction and after ``reset()``; ``step`` overwrites
            it with the new position on every call.
        goal: Position of the goal cell.
    """

    _ORIGIN = (0, 0)

    # Action id -> (row delta, column delta): 0 = up, 1 = down, 2 = left,
    # 3 = right.
    _MOVES = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}

    def __init__(self, size):
        """Create a ``size x size`` grid.

        Raises:
            ValueError: If ``size`` is smaller than 1.
        """
        if size < 1:
            raise ValueError("size must be at least 1")
        self.size = size
        self.grid = np.zeros((size, size))
        self.start = self._ORIGIN
        self.goal = (size - 1, size - 1)  # last block

    def reset(self):
        """Move the agent back to the top-left cell and return that position.

        ``step`` stores the agent's position in ``start``, so without a
        reset every new episode would begin wherever the previous one ended.
        """
        self.start = self._ORIGIN
        return self.start

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``.

        Args:
            action: 0 = up, 1 = down, 2 = left, 3 = right. Any other value
                leaves the position unchanged.

        Returns:
            The new ``(row, col)`` position, the reward (1 when the new
            position is the goal, otherwise 0) and whether the goal was
            reached.
        """
        row, col = self.start
        d_row, d_col = self._MOVES.get(action, (0, 0))

        # Clamp to the grid so moves into a wall keep the agent in place.
        last = self.size - 1
        row = min(max(row + d_row, 0), last)
        col = min(max(col + d_col, 0), last)

        self.start = (row, col)
        done = self.start == self.goal
        reward = 1 if done else 0

        return self.start, reward, done


In this environment, the agent starts in the top-left corner and must navigate to the bottom-right corner to reach the goal state. The step function takes an action (0 = up, 1 = down, 2 = left, 3 = right) and returns the next state, reward, and a boolean indicating whether the episode has terminated.

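Note that this environment is described as model-based because its transition dynamics and rewards are explicitly defined: the next state is a deterministic function of the current state and the action taken, and the reward depends only on the resulting state. The Q-learning agent below does not use this model directly; it learns only from the transitions it samples by calling `step`.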

## 1.1 Value-based Algorithm

In [None]:
import numpy as np

class QLearningAgent:
    """Tabular Q-learning agent for a square grid environment.

    The environment must expose ``size`` and ``start`` attributes and a
    ``step(action)`` method returning ``(next_state, reward, done)``, where
    states are ``(row, col)`` tuples.

    Attributes:
        env: The environment being learned.
        alpha: Learning rate.
        gamma: Discount factor.
        eps: Exploration rate (probability of taking a random action).
        Q: Array of shape ``(size, size, 4)`` holding one value per
            state-action pair.
    """

    def __init__(self, env, alpha=0.1, gamma=0.9, eps=0.1):
        self.env = env
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount factor
        self.eps = eps  # exploration rate
        self.Q = np.zeros((env.size, env.size, 4))  # 4 represents total number of actions
        # Remember where episodes begin: ``env.step`` overwrites ``env.start``.
        self._initial_state = env.start

    def _reset_env(self):
        """Put the environment back at its initial state and return it.

        Uses ``env.reset()`` when the environment provides one; otherwise
        restores ``env.start`` to the position seen at construction time.
        """
        reset = getattr(self.env, "reset", None)
        if callable(reset):
            return reset()
        self.env.start = self._initial_state
        return self._initial_state

    def _greedy_action(self, state):
        """Return an action with the highest Q-value, breaking ties randomly.

        ``np.argmax`` alone always picks action 0 on an all-zero row, which
        pins an untrained agent against the top wall.
        """
        q_values = self.Q[state]
        best_actions = np.flatnonzero(q_values == q_values.max())
        return int(np.random.choice(best_actions))

    def act(self, state):
        """Choose an action for ``state`` with an epsilon-greedy policy."""
        if np.random.rand() < self.eps:
            return int(np.random.randint(self.Q.shape[-1]))  # explore
        return self._greedy_action(state)  # exploit

    def train(self, episodes):
        """Run ``episodes`` episodes of Q-learning, each from the initial state."""
        for _ in range(episodes):
            state = self._reset_env()
            done = False

            while not done:
                action = self.act(state)
                next_state, reward, done = self.env.step(action)

                # A terminal state has no future value to bootstrap from.
                max_next_q = 0.0 if done else np.max(self.Q[next_state])

                td_target = reward + self.gamma * max_next_q
                td_error = td_target - self.Q[state][action]
                self.Q[state][action] += self.alpha * td_error

                state = next_state

    def test(self, max_steps=1000):
        """Follow the greedy policy from the initial state and print each action.

        Args:
            max_steps: Upper bound on the number of steps, so a policy that
                never reaches a terminal state cannot loop forever.

        Returns:
            A tuple ``(reward, Q)`` with the reward of the last step taken
            (0 if no step was taken) and the Q-table.
        """
        state = self._reset_env()
        reward = 0

        for _ in range(max_steps):
            action = self._greedy_action(state)
            print("action{}".format(action))
            state, reward, done = self.env.step(action)
            if done:
                break

        return reward, self.Q


In [None]:
# Build a 4x4 grid world and train a Q-learning agent on it for 1000 episodes.
env = GridWorld(size=4)
agent = QLearningAgent(env=env)
agent.train(episodes=1000)

# Run the learned policy once; test() prints every action it takes.
reward, Q = agent.test()

# Report the final reward followed by the learned Q-table.
print("Reached goal with reward {}".format(reward))
print("Q value list is as follow:", Q, sep="\n")


This code trains the agent for 1000 episodes and then tests it by navigating from the start state towards the goal state using the learned Q-table. The output lists the actions taken, shows whether the agent reached the goal state (reward = 1) or not (reward = 0), and prints the learned Q-table.

# 2. Model Free 

## Create environment

In [None]:
import random

class Environment:
    """Toy environment with four states, three actions and random transitions.

    States and actions are passed around as integer indices into
    ``state_space`` and ``action_space``; the names are only used to look up
    rewards in ``reward_space``.

    Attributes:
        state_space: State names.
        action_space: Action names.
        reward_space: Mapping ``state name -> action name -> reward``.
        current_state: Index of the current state in ``state_space``.
    """

    def __init__(self):
        self.state_space = ['s0', 's1', 's2', 's3']
        self.action_space = ['a0', 'a1', 'a2']
        self.reward_space = {'s0': {'a0': 1, 'a1': 4, 'a2': 0},
                             's1': {'a0': 4, 'a1': 1, 'a2': 4},
                             's2': {'a0': 5, 'a1': 0, 'a2': 2},
                             's3': {'a0': 0, 'a1': 3, 'a2': 0}}
        # Stored as an index (not a name) because step() indexes
        # state_space with it.
        self.current_state = self._random_state()

    def _random_state(self):
        """Return a uniformly random state index."""
        return random.randrange(len(self.state_space))

    def step(self, action):
        """Take ``action`` in the current state.

        Args:
            action: Index into ``action_space``.

        Returns:
            A tuple ``(next_state, reward)``: the index of the uniformly
            random next state and the reward for the state-action pair that
            was just taken.
        """
        state_name = self.state_space[self.current_state]
        action_name = self.action_space[action]
        reward = self.reward_space[state_name][action_name]
        self.current_state = self._random_state()
        return self.current_state, reward

    def reset(self):
        """Jump to a uniformly random state and return its index."""
        self.current_state = self._random_state()
        return self.current_state


In this example, we have a state space of four states ('s0', 's1', 's2', 's3'), an action space of three actions ('a0', 'a1', 'a2'), and a reward space that maps each state-action pair to a reward. The transition dynamics are random, with the next state chosen uniformly at random from the state space. The step function takes an action (as an index into the action space) and returns the next state (as an index into the state space) and the reward; there is no done flag, so the agent decides how many steps each episode lasts. The reset function resets the environment to a random initial state and returns that state. This is a simple example, but in practice, the state space and action space could be much larger and the transition dynamics more complex.

## 2.1 Value-based Algorithm

In [None]:
import numpy as np

class QLearningAgent1:
    """Tabular Q-learning agent for an environment without terminal states.

    The environment must expose ``state_space`` and ``action_space``
    sequences, a ``reset()`` method returning a state index and a
    ``step(action)`` method returning ``(next_state, reward)``.

    Attributes:
        env: The environment being learned.
        alpha: Learning rate.
        gamma: Discount factor.
        eps: Exploration rate (probability of taking a random action).
        Q: Array of shape ``(n_states, n_actions)``.
    """

    def __init__(self, env, alpha=0.1, gamma=0.9, eps=0.1):
        self.env = env
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount factor
        self.eps = eps  # exploration rate
        self.Q = np.zeros((len(env.state_space), len(env.action_space)))

    def act(self, state):
        """Choose an action index for ``state`` with an epsilon-greedy policy."""
        if np.random.rand() < self.eps:
            # Explore: the action count comes from the Q-table, not a constant.
            return int(np.random.randint(self.Q.shape[1]))
        return int(np.argmax(self.Q[state]))  # exploit

    def train(self, episodes, trajsize):
        """Run ``episodes`` episodes of ``trajsize`` Q-learning steps each.

        Args:
            episodes: Number of episodes; each starts with ``env.reset()``.
            trajsize: Number of steps taken in every episode.
        """
        for _episode in range(episodes):
            state = self.env.reset()
            # A fresh counter per episode: every episode gets its full
            # trajsize steps.
            for _step in range(trajsize):
                action = self.act(state)
                next_state, reward = self.env.step(action)

                # No terminal states, so always bootstrap from the next state.
                max_next_q = np.max(self.Q[next_state])

                td_target = reward + self.gamma * max_next_q
                td_error = td_target - self.Q[state][action]
                self.Q[state][action] += self.alpha * td_error

                state = next_state

    def test(self, trajsize):
        """Follow the greedy policy for ``trajsize`` steps after a reset.

        Returns:
            A tuple ``(reward, Q)`` with the reward of the last step taken
            (``None`` when ``trajsize`` is not positive) and the Q-table.
        """
        state = self.env.reset()
        reward = None

        for _step in range(trajsize):
            action = int(np.argmax(self.Q[state]))
            state, reward = self.env.step(action)
        return reward, self.Q


In [None]:
# Train a Q-learning agent on the random-transition environment:
# 1000 episodes with a trajectory size of 50.
env = Environment()
agent = QLearningAgent1(env=env)
agent.train(episodes=1000, trajsize=50)

# Follow the greedy policy for 4 steps and keep the last reward.
reward, Q = agent.test(trajsize=4)

# Report the final reward followed by the learned Q-table.
print("Reached goal with reward {}".format(reward))
print("Q value list is as follow:", Q, sep="\n")

Reached goal with reward 3
Q value list is as follow:
[[0.41966151 1.93320758 0.33521994]
 [0.82879939 0.44786282 1.55650642]
 [1.58165659 0.43575932 1.19501751]
 [0.31547024 0.71289493 0.15782569]]


This code trains the agent for 1000 episodes with a trajectory size of 50 steps and then tests it by following the greedy policy from a random reset state for 4 steps using the learned Q-table. This environment has no goal state, so the reported reward is simply the reward received on the last test step; it is printed together with the learned Q-table.