 Practical-3 : Write a python program to solve the multi-armed bandit problem using the exploration only, greedy algorithm and epsilon greedy
 algorithm. Compare the reward obtained with random sampling.

USING EXPLORATION AND COUNT NUMBER OF TIMES EACH MACHINE IS RUNNING 

In [4]:


import numpy as np

class Machine:
    def __init__(self, true_mean):
        self.true_mean = true_mean
        self.sample_count = 0
        self.sample_mean = 0

    def pull(self):
        reward = np.random.randn() + self.true_mean
        self.sample_count += 1
        self.sample_mean = (1 - 1.0/self.sample_count) * self.sample_mean + (1.0/self.sample_count) * reward
        return reward

def random_machine_choice(machines, num_trials):
    rewards = np.zeros(num_trials)
    machine_counts = [0] * len(machines)  # List to keep track of the number of times each machine is chosen
    
    for trial in range(num_trials):
        chosen_machine = np.random.randint(len(machines))  # Randomly choose a machine
        machine_counts[chosen_machine] += 1  # Increment the count for the chosen machine
        
        # Pull the chosen machine
        reward = machines[chosen_machine].pull()
        rewards[trial] = reward
        
    return rewards, machine_counts

# Number of machines and their true means
num_machines = 5
true_means = [4.0, 3.0, 5.0, 7.0, 2.0]

# Initialize machines
machines = [Machine(true_mean) for true_mean in true_means]

# Set the number of trials
num_trials = 1000

# Run the random machine choice strategy
rewards, machine_counts = random_machine_choice(machines, num_trials)

# Print the estimated means of each machine
for i, m in enumerate(machines):
    print(f"Estimated mean for Machine {i + 1}: {m.sample_mean:.4f}")

# Print the number of times each machine has been run
for i, count in enumerate(machine_counts):
    print(f"Machine {i + 1} has been run {count} times")

# Print the total cumulative reward over all trials
print(f"Total cumulative reward: {np.sum(rewards)}")


Estimated mean for Machine 1: 3.9575
Estimated mean for Machine 2: 3.0728
Estimated mean for Machine 3: 5.0990
Estimated mean for Machine 4: 6.8987
Estimated mean for Machine 5: 2.0217
Machine 1 has been run 213 times
Machine 2 has been run 183 times
Machine 3 has been run 174 times
Machine 4 has been run 216 times
Machine 5 has been run 214 times
Total cumulative reward: 4215.2582860576795


USING GREEDY ALGORITHM

In [7]:
import numpy as np

class Machine:
    def __init__(self, true_mean):
        self.true_mean = true_mean
        self.sample_count = 0
        self.sample_mean = 0

    def pull(self):
       
        reward = np.random.randn() + self.true_mean
        self.sample_count += 1
        self.sample_mean = (1 - 1.0/self.sample_count) * self.sample_mean + (1.0/self.sample_count) * reward
        return reward

def greedy_algorithm(machines, num_trials):
    rewards = np.zeros(num_trials)
    
    for trial in range(num_trials):
        # Choose the machine with the highest estimated mean (greedy choice)
        chosen_machine = np.argmax([m.sample_mean for m in machines])
        
        # Pull the chosen machine
        reward = machines[chosen_machine].pull()
        rewards[trial] = reward
        
    return rewards

# Number of machines and their true means
num_machines = 5
true_means = [4.0, 3.0, 5.0, 7.0, 2.0]

# Initialize machines
machines = [Machine(true_mean) for true_mean in true_means]

# Set the number of trials
num_trials = 1000

# Run the greedy algorithm
rewards = greedy_algorithm(machines, num_trials)

# Print the estimated means of each machine
for i, m in enumerate(machines):
    print(f"Estimated mean for Machine {i + 1}: {m.sample_mean:.4f}")

# Print the total cumulative reward over all trials
print(f"Total cumulative reward: {np.sum(rewards)}")


Estimated mean for Machine 1: 4.0342
Estimated mean for Machine 2: 0.0000
Estimated mean for Machine 3: 0.0000
Estimated mean for Machine 4: 0.0000
Estimated mean for Machine 5: 0.0000
Total cumulative reward: 4034.1830946887626


USING EPSILON-GREEDY ALGORITHM

In [19]:
import numpy as np

class Machine:
    def __init__(self, true_mean):
        self.true_mean = true_mean
        self.sample_count = 0
        self.sample_mean = 0

    def pull(self):
        reward = np.random.randn() + self.true_mean
        self.sample_count += 1
        self.sample_mean = (1 - 1.0/self.sample_count) * self.sample_mean + (1.0/self.sample_count) * reward
        return reward

def epsilon_greedy(machines, epsilon, num_trials):
    rewards = np.zeros(num_trials)
    machine_counts = [0] * len(machines)
    
    for trial in range(num_trials):
        if np.random.rand() < epsilon:
            # Explore: Randomly choose a machine
            chosen_machine = np.random.randint(len(machines))
        else:
            # Exploit: Choose the machine with the highest estimated mean
            chosen_machine = np.argmax([m.sample_mean for m in machines])
            
        machine_counts[chosen_machine] += 1
        reward = machines[chosen_machine].pull()
        rewards[trial] = reward
        
    return rewards, machine_counts

# Number of machines and their true means
num_machines = 5
true_means = [4.0, 3.0, 5.0, 7.0, 2.0]

# Initialize machines
machines = [Machine(true_mean) for true_mean in true_means]

# Set the number of trials and epsilon value
num_trials = 1000
epsilon = 0.1  # You can adjust epsilon as needed

# Run the epsilon-greedy strategy
rewards, machine_counts = epsilon_greedy(machines, epsilon, num_trials)

# Print the estimated means of each machine
for i, m in enumerate(machines):
    print(f"Estimated mean for Machine {i + 1}: {m.sample_mean:.4f}")

# Print the number of times each machine has been run
for i, count in enumerate(machine_counts):
    print(f"Machine {i + 1} has been run {count} times")

# Print the total cumulative reward over all trials
print(f"Total cumulative reward: {np.sum(rewards)}")


Estimated mean for Machine 1: 4.0435
Estimated mean for Machine 2: 2.4800
Estimated mean for Machine 3: 5.1999
Estimated mean for Machine 4: 6.9911
Estimated mean for Machine 5: 1.6865
Machine 1 has been run 66 times
Machine 2 has been run 19 times
Machine 3 has been run 20 times
Machine 4 has been run 870 times
Machine 5 has been run 25 times
Total cumulative reward: 6542.417793370736
