<a href="https://colab.research.google.com/github/amsaghiri/Multi-armed-bandit/blob/main/MABgame2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import matplotlib.pyplot as plt

# Bandit class represents an individual bandit problem
class Bandit:
    def __init__(self, n_arms, epsilon, alpha):
        self.n_arms = n_arms  # Number of arms (actions)
        self.epsilon = epsilon  # Exploration rate
        self.alpha = alpha  # Learning rate
        self.q = np.zeros(n_arms)  # Action values for each arm initialized to 0
        self.actions = []  # List to store selected actions
        self.rewards = []  # List to store obtained rewards
        self.optimal = []  # List to store whether selected action was optimal

    # Epsilon-greedy algorithm for action selection
    def epsilon_greedy(self):
        if np.random.random() < self.epsilon:
            return np.random.randint(self.n_arms)  # Random action selection
        else:
            return np.argmax(self.q)  # Greedy action selection based on action values

    # Update action value based on obtained reward
    def update_action_value(self, action, reward):
        self.q[action] += self.alpha * (reward - self.q[action])  # Update action value using incremental update
        return self.q[action]

    # Run the bandit problem for a given number of iterations
    def run(self, num_iterations):
        self.actions = np.zeros(num_iterations)
        self.rewards = np.zeros(num_iterations)
        self.optimal = np.zeros(num_iterations)

        for t in range(num_iterations):
            action = self.epsilon_greedy()
            self.actions[t] = action
            reward = self.rewards[t]
            self.rewards[t] = reward
            optimal_action = np.argmax(self.q)
            self.optimal[t] = action == optimal_action
            self.update_action_value(action, reward)

        results = {
            'actions': self.actions,
            'rewards': self.rewards,
            'optimal': self.optimal
        }

        return results


# Node class represents a node in the graph
class Node:
    def __init__(self, id):
        self.id = id  # Node ID
        self.bandit = None  # Each node has a bandit instance associated with it


# Graph class represents the overall graph structure
class Graph:
    def __init__(self, nodes):
        self.nodes = nodes  # List to store graph nodes

    # Add a node to the graph
    def add_node(self, node):
        self.nodes.append(node)

    # Get all nodes in the graph
    def get_nodes(self):
        return self.nodes


# Environment class generates rewards based on a given rate
class Environment:
    def __init__(self, r):
        self.r = r  # Reward value

    # Get a reward based on the rate of 'Yes'
    def get_reward(self, rate_yes):
        if np.random.random() < rate_yes:
            return self.r
        else:
            return 0


# Setting
num_iteration = 100000  # Number of iterations
r = 2  # Reward value
number_players = 10  # Number of players
rate_yes_list = []  # List to store rate of 'Yes' over iterations

# Set third action as the favored action!
environment = Environment(r)  # Create environment instance
graph = Graph([])  # Create graph instance with no nodes

# Define rate ranges for each player
def probability_players(i):
    switcher = {
        0: [0.4, 0.6],
        1: [0.45, 0.55],
        2: [0.1, 0.9],
        3: [0.3, 0.7],
        4: [0.45, 0.55],
        5: [0.35, 0.65],
        6: [0.35, 0.65],
        7: [0.2, 0.8],
        8: [0.5, 0.5],
        9: [0.6, 0.4]
    }
    return switcher.get(i)

# Create nodes and associated bandit instances
for i in range(number_players):
    node = Node(i + 1)
    bandit_instance = Bandit(2, 0.1, 0.1)
    node.bandit = bandit_instance
    graph.add_node(node)

# Iterate for each number of iterations
for index_num_iteration in range(num_iteration):
    rate_yes=0
    rate_yes_list.append(rate_yes)
    action_selected_list = []
    rate_yes = float(rate_yes/number_players)

    # Iterate over nodes in the graph
    for node in graph.get_nodes():
        favor_action = node.bandit.epsilon_greedy()  # Select action using epsilon-greedy algorithm
        reward = environment.get_reward(rate_yes)  # Get reward from the environment
        node.bandit.rewards.append(reward)  # Add reward to bandit instance
        node.bandit.actions.append(favor_action)  # Add selected action to bandit instance
        node.bandit.update_action_value(favor_action, reward)  # Update action value in bandit instance
        action_selected_list.append(favor_action)

    index_num_iteration += 1
    rate_yes = float(rate_yes/number_players)
    rate_yes_list.append(rate_yes)

# Plotting: Plot favor action for each player
for node in graph.get_nodes():
    plt.plot(range(num_iteration), node.bandit.actions, label="Player {}".format(node.id))

plt.xlabel("Iterations")
plt.ylabel("Favor action")
plt.legend()
plt.show()

# Plotting: Plot rate of 'Yes' over iterations
plt.plot(range(num_iteration), rate_yes_list)
plt.xlabel("Iterations")
plt.ylabel("Rate of Yes")
plt.show()


NameError: ignored