In [1]:
import numpy as np
import random

class MultiArmBandit:
    """Epsilon-greedy agent playing a Gaussian multi-armed bandit.

    Every arm has a hidden mean reward drawn uniformly from [0, 1) when the
    bandit is created. Pulling an arm returns a sample from a normal
    distribution centred on that mean with unit standard deviation, and the
    agent maintains a running sample-average estimate for each arm.

    All randomness comes from NumPy's global generator, so calling
    ``np.random.seed(...)`` before construction makes a run reproducible.
    """

    def __init__(self, num_arms, epsilon):
        """Create a bandit with ``num_arms`` arms.

        Args:
            num_arms: number of arms available to pull.
            epsilon: probability of exploring (choosing a uniformly random
                arm) instead of exploiting the current best estimate.
        """
        self.num_arms = num_arms
        # True (hidden) mean reward of each arm.
        self.arms = np.random.rand(num_arms)
        # Number of times each arm has been pulled so far.
        self.num_pulls = np.zeros(num_arms)
        # Running sample-average reward estimate per arm.
        self.avg_rewards = np.zeros(num_arms)
        self.epsilon = epsilon

    def pull_arm(self, arm):
        """Pull ``arm``, update its running average, and return the reward.

        Args:
            arm: zero-based index of the arm to pull.

        Returns:
            The sampled reward.
        """
        reward = np.random.normal(self.arms[arm], 1)
        self.num_pulls[arm] += 1
        # Incremental mean: Q_n = Q_{n-1} + (r_n - Q_{n-1}) / n, which avoids
        # storing the full reward history.
        self.avg_rewards[arm] += (reward - self.avg_rewards[arm]) / self.num_pulls[arm]
        return reward

    def epsilon_greedy(self):
        """Choose an arm with the epsilon-greedy rule.

        Returns:
            Zero-based arm index as a plain ``int``: a uniformly random arm
            with probability ``epsilon``, otherwise the arm with the highest
            estimated reward (the lowest index wins ties).
        """
        if np.random.rand() < self.epsilon:
            # Use NumPy's generator rather than the stdlib `random` module so
            # that a single np.random.seed() call controls every draw.
            return int(np.random.randint(0, self.num_arms))
        return int(np.argmax(self.avg_rewards))

    def run_bandit(self, num_iterations):
        """Play ``num_iterations`` rounds, printing each round's outcome.

        Iteration and arm numbers are printed one-based.
        """
        for i in range(num_iterations):
            arm = self.epsilon_greedy()
            reward = self.pull_arm(arm)
            print("Iteration: {}, Arm: {}, Reward: {}".format(i+1, arm+1, reward))

if __name__ == '__main__':
    # Seed every generator the simulation may draw from (NumPy's global
    # generator and the stdlib `random` module) so that re-running this cell
    # reproduces the same sequence of arms and rewards.
    seed = 42
    np.random.seed(seed)
    random.seed(seed)

    num_arms = 5          # number of bandit arms
    epsilon = 0.1         # exploration probability
    num_iterations = 1000 # number of pulls to simulate
    bandit = MultiArmBandit(num_arms, epsilon)
    bandit.run_bandit(num_iterations)

Iteration: 1, Arm: 3, Reward: 0.8427759508189916
Iteration: 2, Arm: 3, Reward: -0.10625092868157981
Iteration: 3, Arm: 3, Reward: 2.3747091415078843
Iteration: 4, Arm: 3, Reward: 1.1566899137883562
Iteration: 5, Arm: 3, Reward: 0.6999755986004063
Iteration: 6, Arm: 3, Reward: 1.2485167855616153
Iteration: 7, Arm: 3, Reward: 1.1174953954106126
Iteration: 8, Arm: 3, Reward: 0.6847488807395656
Iteration: 9, Arm: 3, Reward: 0.3139461209446696
Iteration: 10, Arm: 3, Reward: -0.5023169180281434
Iteration: 11, Arm: 4, Reward: -0.09211653457983215
Iteration: 12, Arm: 3, Reward: -0.2578592399451892
Iteration: 13, Arm: 3, Reward: 0.6796628080099467
Iteration: 14, Arm: 3, Reward: 0.31878906211234503
Iteration: 15, Arm: 3, Reward: 0.8681633002576767
Iteration: 16, Arm: 3, Reward: 1.8369770696647474
Iteration: 17, Arm: 3, Reward: 0.5953344075950456
Iteration: 18, Arm: 3, Reward: -0.8667554937829912
Iteration: 19, Arm: 3, Reward: -0.38748087768431383
Iteration: 20, Arm: 3, Reward: 0.80770851463926
I

In [2]:
#In the above code, the epsilon parameter determines the probability of selecting a random arm, rather than the one with the highest average reward.

#The epsilon_greedy method implements the epsilon-greedy algorithm by randomly selecting an arm with probability epsilon, or selecting the arm with the highest average reward with probability 1 - epsilon.

#The run_bandit method runs the bandit algorithm for a given number of iterations, selecting an arm using the epsilon-greedy algorithm and pulling the arm to get a reward. The iteration number, arm number, and reward obtained are printed out.

In [3]:
import numpy as np

def iterative_policy_evaluation(env, policy, gamma=1.0, theta=1e-8):
    """
    Estimate the state-value function of a fixed policy by repeated sweeps.

    Args:
    env: object exposing `nS` (the number of states) and `P[s][a]`, an iterable
         of (probability, next_state, reward, done) tuples for state s, action a
    policy: indexable by state; policy[s] yields the probability of each action in s
    gamma: discount factor, default to 1.0
    theta: a sweep whose largest value change is below this ends the iteration

    Returns:
    1D numpy array of length env.nS holding the estimated value of every state
    """
    n_states = env.nS
    values = np.zeros(n_states)  # all estimates start at zero

    converged = False
    while not converged:
        largest_change = 0

        for state in range(n_states):
            # Expected one-step return under the policy; updates are applied
            # in place, so later states in this sweep see the fresh values.
            backup = 0
            for action, pi_sa in enumerate(policy[state]):
                for p, successor, r, _done in env.P[state][action]:
                    backup += pi_sa * p * (r + gamma * values[successor])

            change = abs(values[state] - backup)
            if change > largest_change:
                largest_change = change
            values[state] = backup

        # stop once no state moved by theta or more during the sweep
        converged = largest_change < theta

    return values

In [4]:
#This implementation assumes that you're using an OpenAI Gym environment, where `env.nS` is the number of states in the environment and `env.P[s][a]` is a list of tuples representing the transition dynamics for state `s` and action `a`. Each tuple contains four elements: the probability `prob` of transitioning to the next state, the next state `next_state`, the reward `reward` received for the transition, and a boolean `done` indicating whether the episode has terminated.

#To use this implementation, pass in an environment object `env` and a policy array `policy`. The policy should be a 2D numpy array with shape (S, A), where S is the number of states in the environment and A is the number of actions. The value function `V` is returned as a 1D numpy array with shape (S,).