In [None]:
import sys
sys.path.insert(0,'..')

import numpy as np
from itertools import accumulate
from matplotlib import pyplot as plt

from ActionValue.multi_armed_bandit import MultiArmedBandit
from ActionValue.action_value import *

%matplotlib inline

In [None]:
# Shared experiment settings used by the multi-run cells below.
default_numruns = 2000   # number of independent runs averaged per curve
default_numsteps = 1000  # number of agent steps taken in each run


def testbed_env_gen():
    """Return a new bandit from ``MultiArmedBandit.random_gen(10, scale=1)``.

    A fresh environment is produced on every call, so each run gets its own.
    """
    return MultiArmedBandit.random_gen(10, scale=1)

In [None]:
def book_env_gen():
    """Return a new bandit built from ten hard-coded arm means with ``scale=1``."""
    arm_means = [0.2, -0.9, 1.5, 0.4, 1.2, -1.5, -0.2, -1.1, 0.8, -0.5]
    return MultiArmedBandit(arm_means, scale=1)


# 200 standard-normal draws per action, shifted by the environment's `means`,
# give an illustrative picture of each action's reward distribution.
reward_noise = np.random.randn(200, 10)
reward_samples = reward_noise + book_env_gen().means
plt.violinplot(reward_samples)
plt.xlabel("Action")
plt.ylabel("Reward distribution")
plt.show()


In [None]:
def single_run(avm, numsteps):
    """Step ``avm`` ``numsteps`` times and summarise what its environment recorded.

    Parameters
    ----------
    avm
        Object exposing ``step()`` and an ``env`` attribute whose ``rewards``
        and ``best_actions`` are sequences.
    numsteps : int
        Number of times ``avm.step()`` is called.

    Returns
    -------
    tuple
        ``(rewards_copy, cumulative_mean_rewards, cumulative_best_action_perc)``
        where the first item is ``avm.env.rewards.copy()`` and the other two
        are lists holding the running mean of ``rewards`` and ``best_actions``
        respectively (at most ``numsteps`` entries each).
    """
    for _ in range(numsteps):
        avm.step()

    env = avm.env
    step_counts = range(1, numsteps + 1)

    def running_mean(values):
        # zip stops at the shorter input, so no more than `numsteps` entries result.
        return [total / count for total, count in zip(accumulate(values), step_counts)]

    cumulative_mean_rewards = running_mean(env.rewards)
    cumulative_best_action_perc = running_mean(env.best_actions)
    return env.rewards.copy(), cumulative_mean_rewards, cumulative_best_action_perc

In [None]:
# One agent per exploration setting, each on its own fixed-means bandit.
g1 = MeanValueGreedy(book_env_gen())
g2 = MeanValueEpsilonGreedy(book_env_gen(), epsilon=0.01)
g3 = MeanValueEpsilonGreedy(book_env_gen(), epsilon=0.1)

# A single run of `default_numsteps` steps for each agent.
(rewards1, mean_rewards1, best_actions_perc1) = single_run(g1, default_numsteps)
(rewards2, mean_rewards2, best_actions_perc2) = single_run(g2, default_numsteps)
(rewards3, mean_rewards3, best_actions_perc3) = single_run(g3, default_numsteps)

# Overlay the running mean reward of the three agents.
single_run_steps = range(default_numsteps)
single_run_curves = (
    ("greedy", mean_rewards1),
    ("0.01-greedy", mean_rewards2),
    ("0.1-greedy", mean_rewards3),
)
for curve_label, curve_values in single_run_curves:
    plt.plot(single_run_steps, curve_values, label=curve_label)
plt.title("single run execution on the testbed")
plt.xlabel("iteration step")
plt.ylabel("cumulative mean of rewards")
plt.legend()
plt.show()

In [None]:
def multi_run(avm_gen, numsteps, numruns):
    """Average per-step rewards and best-action flags over several independent runs.

    Parameters
    ----------
    avm_gen : callable
        Zero-argument factory; called once per run to obtain an object with
        ``step()`` and an ``env`` exposing ``rewards`` and ``best_actions``.
    numsteps : int
        Steps per run; also the length of the returned arrays.
    numruns : int
        Number of runs to average over.

    Returns
    -------
    tuple of numpy.ndarray
        ``(mean_rewards, mean_bestactions)``: element-wise averages across runs
        of ``env.rewards`` and ``env.best_actions``.
    """
    reward_totals = np.zeros(numsteps)
    best_action_totals = np.zeros(numsteps)

    for _ in range(numruns):
        agent = avm_gen()
        for _ in range(numsteps):
            agent.step()
        # Accumulate this run's per-step records; averaging happens after the loop.
        reward_totals += agent.env.rewards
        best_action_totals += agent.env.best_actions

    return reward_totals / numruns, best_action_totals / numruns

In [None]:
def multi_run_graph(labels, mean_rewards, mean_bestactions, title=""):
    """Draw a two-panel figure comparing several multi-run results.

    The left panel plots each entry of ``mean_rewards`` and the right panel
    each entry of ``mean_bestactions``, one line per label. The function only
    draws; the caller is responsible for ``plt.show()``.

    Parameters
    ----------
    labels : sequence of str
        Legend label for each curve; its length decides how many curves are drawn.
    mean_rewards : sequence of sequences
        Per-step mean reward for each curve, indexed like ``labels``.
    mean_bestactions : sequence of sequences
        Per-step best-action average for each curve, indexed like ``labels``.
        Values are plotted as given; no scaling to percent is applied here.
    title : str, optional
        Title placed on both panels.
    """
    # The x axis is taken from the first reward curve and shared by every line.
    steps = range(len(mean_rewards[0]))
    panels = (
        (121, mean_rewards, "mean reward"),
        (122, mean_bestactions, "% of correct actions"),
    )

    plt.figure(figsize=(15, 5))
    for position, series, ylabel in panels:
        plt.subplot(position)
        for i, label in enumerate(labels):
            plt.plot(steps, series[i], label=label)
        plt.xlabel("iteration steps")  # was misspelled "iteraton steps"
        plt.ylabel(ylabel)
        plt.title(title)
        plt.legend()

In [None]:
# Time one complete greedy (epsilon = 0) multi-run on the testbed environment.
# The result is assigned to `_` because only the timing report is of interest.
%time _ = multi_run(lambda: MeanValueEpsilonGreedy(testbed_env_gen(), epsilon = 0), default_numsteps, default_numruns)

In [None]:
# MeanValueEpsilonGreedy on the testbed at three exploration rates.
# Each entry pairs a legend label with the agent factory that multi_run
# calls once per run.
mean_value_variants = [
    ("greedy", lambda: MeanValueEpsilonGreedy(testbed_env_gen(), epsilon=0)),
    ("0.01-greedy", lambda: MeanValueEpsilonGreedy(testbed_env_gen(), epsilon=0.01)),  # label was "0.01-greey"
    ("0.1-greedy", lambda: MeanValueEpsilonGreedy(testbed_env_gen(), epsilon=0.1)),
]

# Run the variants in listed order; each result is (mean_rewards, mean_bestactions).
mean_value_results = [
    multi_run(agent_gen, default_numsteps, default_numruns)
    for _, agent_gen in mean_value_variants
]
(
    (mean_rewards1, mean_bestactions1),
    (mean_rewards2, mean_bestactions2),
    (mean_rewards3, mean_bestactions3),
) = mean_value_results

multi_run_graph(
    [label for label, _ in mean_value_variants],
    [mean_rewards1, mean_rewards2, mean_rewards3],
    [mean_bestactions1, mean_bestactions2, mean_bestactions3],
    title="Mean-value algorithms",
)
plt.show()

In [None]:
# Three constant step sizes against MeanValueEpsilonGreedy, all with epsilon = 0.1.
# Each entry pairs a legend label with the agent factory handed to multi_run.
step_size_variants = [
    ("alpha = 0.01", lambda: ConstantStepEpsilonGreedy(testbed_env_gen(), epsilon=0.1, alpha=0.01)),
    ("alpha = 0.05", lambda: ConstantStepEpsilonGreedy(testbed_env_gen(), epsilon=0.1, alpha=0.05)),
    ("alpha = 0.1", lambda: ConstantStepEpsilonGreedy(testbed_env_gen(), epsilon=0.1, alpha=0.1)),
    ("mean-value", lambda: MeanValueEpsilonGreedy(testbed_env_gen(), epsilon=0.1)),
]

# Run the variants in listed order; each result is (mean_rewards, mean_bestactions).
step_size_results = [
    multi_run(agent_gen, default_numsteps, default_numruns)
    for _, agent_gen in step_size_variants
]
(
    (mean_rewards1, mean_bestactions1),
    (mean_rewards2, mean_bestactions2),
    (mean_rewards3, mean_bestactions3),
    (mean_rewards4, mean_bestactions4),
) = step_size_results

multi_run_graph(
    [label for label, _ in step_size_variants],
    [mean_rewards1, mean_rewards2, mean_rewards3, mean_rewards4],
    [mean_bestactions1, mean_bestactions2, mean_bestactions3, mean_bestactions4],
    title="Constant-step and mean-value 0.1-greedy algorithms",
)
plt.show()

In [None]:
# Effect of the initial estimate: a greedy agent started with
# initial_preference = 5 versus greedy and 0.1-greedy agents that use the
# constructor's default (labelled Q_init=0 in the legend).
initial_estimate_variants = [
    (
        "alpha=0.1 greedy Q_init=5",
        lambda: ConstantStepEpsilonGreedy(
            testbed_env_gen(), epsilon=0.0, alpha=0.1, initial_preference=5
        ),
    ),
    (
        "alpha=0.1 greedy Q_init=0",
        lambda: ConstantStepEpsilonGreedy(testbed_env_gen(), epsilon=0.0, alpha=0.1),
    ),
    (
        "alpha=0.1 0.1-greedy Q_init=0",
        lambda: ConstantStepEpsilonGreedy(testbed_env_gen(), epsilon=0.1, alpha=0.1),
    ),
]

# Run the variants in listed order; each result is (mean_rewards, mean_bestactions).
initial_estimate_results = [
    multi_run(agent_gen, default_numsteps, default_numruns)
    for _, agent_gen in initial_estimate_variants
]
(
    (mean_rewards1, mean_bestactions1),
    (mean_rewards2, mean_bestactions2),
    (mean_rewards3, mean_bestactions3),
) = initial_estimate_results

multi_run_graph(
    [label for label, _ in initial_estimate_variants],
    [mean_rewards1, mean_rewards2, mean_rewards3],
    [mean_bestactions1, mean_bestactions2, mean_bestactions3],
    title="Algorithms with different initial estimates",
)
plt.show()

In [None]:
def nonstationary_env_gen():
    """Return a new bandit with ten arm means starting at 0.0, ``scale=1`` and ``drift=0.01``."""
    initial_means = [0.0] * 10
    return MultiArmedBandit(initial_means, scale=1, drift=0.01)

In [None]:
# Time one complete greedy (epsilon = 0) multi-run on the nonstationary
# environment, using default_numsteps steps per run. The result is assigned
# to `_` because only the timing report is of interest.
%time _ = multi_run(lambda: MeanValueEpsilonGreedy(nonstationary_env_gen(), epsilon = 0), default_numsteps, default_numruns)

In [None]:
# Mean-value versus constant-step (alpha = 0.1) agents, both 0.1-greedy, on
# the drifting environment. These runs use 10000 steps rather than
# default_numsteps.
nonstationary_numsteps = 10000
nonstationary_variants = [
    (
        "mean-value, 0.1-greedy",
        lambda: MeanValueEpsilonGreedy(nonstationary_env_gen(), epsilon=0.1),
    ),
    (
        "alpha = 0.1, 0.1-greedy",
        lambda: ConstantStepEpsilonGreedy(nonstationary_env_gen(), epsilon=0.1, alpha=0.1),
    ),
]

# Run the variants in listed order; each result is (mean_rewards, mean_bestactions).
nonstationary_results = [
    multi_run(agent_gen, nonstationary_numsteps, default_numruns)
    for _, agent_gen in nonstationary_variants
]
(
    (mean_rewards1, mean_bestactions1),
    (mean_rewards2, mean_bestactions2),
) = nonstationary_results

multi_run_graph(
    [label for label, _ in nonstationary_variants],
    [mean_rewards1, mean_rewards2],
    [mean_bestactions1, mean_bestactions2],
    title="Nonstationary environment",
)
plt.show()

In [None]:
# MeanValueUCB (c = 2) against MeanValueEpsilonGreedy (epsilon = 0.1) on the testbed.
# Each entry pairs a legend label with the agent factory handed to multi_run.
ucb_variants = [
    ("mean-value UCB c=2", lambda: MeanValueUCB(testbed_env_gen(), c=2)),  # label was "nean-value UCB c=2"
    ("mean-value 0.1-greedy", lambda: MeanValueEpsilonGreedy(testbed_env_gen(), epsilon=0.1)),
]

# Run the variants in listed order; each result is (mean_rewards, mean_bestactions).
ucb_results = [
    multi_run(agent_gen, default_numsteps, default_numruns)
    for _, agent_gen in ucb_variants
]
(
    (mean_rewards1, mean_bestactions1),
    (mean_rewards2, mean_bestactions2),
) = ucb_results

multi_run_graph(
    [label for label, _ in ucb_variants],
    [mean_rewards1, mean_rewards2],
    [mean_bestactions1, mean_bestactions2],
    title="UCB vs epsilon greedy",
)
plt.show()

In [None]:
def testbed_baseline_env_gen():
    """Return a new bandit from ``MultiArmedBandit.random_gen(10, mean_loc=4.0, scale=1.0)``."""
    return MultiArmedBandit.random_gen(10, mean_loc=4.0, scale=1.0)

In [None]:
# Time one complete GradientAlgorithm (alpha = 0.1) multi-run on the
# mean_loc = 4.0 testbed. The result is assigned to `_` because only the
# timing report is of interest.
%time _ = multi_run(lambda: GradientAlgorithm(testbed_baseline_env_gen(), alpha=0.1), default_numsteps, default_numruns)

In [None]:
# GradientAlgorithm at two step sizes, with the constructor's default
# baseline setting and with baseline=False.
# Each entry pairs a legend label with the agent factory handed to multi_run.
gradient_variants = [
    (
        "baseline alpha=0.1",
        lambda: GradientAlgorithm(testbed_baseline_env_gen(), alpha=0.1),
    ),
    (
        "baseline alpha=0.4",
        lambda: GradientAlgorithm(testbed_baseline_env_gen(), alpha=0.4),
    ),
    (
        "no baseline alpha=0.1",
        lambda: GradientAlgorithm(testbed_baseline_env_gen(), alpha=0.1, baseline=False),
    ),
    (
        "no baseline alpha=0.4",
        lambda: GradientAlgorithm(testbed_baseline_env_gen(), alpha=0.4, baseline=False),
    ),
]

# Run the variants in listed order; each result is (mean_rewards, mean_bestactions).
gradient_results = [
    multi_run(agent_gen, default_numsteps, default_numruns)
    for _, agent_gen in gradient_variants
]
(
    (mean_rewards1, mean_bestactions1),
    (mean_rewards2, mean_bestactions2),
    (mean_rewards3, mean_bestactions3),
    (mean_rewards4, mean_bestactions4),
) = gradient_results

multi_run_graph(
    [label for label, _ in gradient_variants],
    [mean_rewards1, mean_rewards2, mean_rewards3, mean_rewards4],
    [mean_bestactions1, mean_bestactions2, mean_bestactions3, mean_bestactions4],
    title="Gradient based algorithm",
)
plt.show()

In [None]:
# ConstantStepEpsilonGreedy against ExpWeightNoBiasEpsilonGreedy at
# alpha = 0.1 and alpha = 0.01, all with epsilon = 0.1.
# Each entry pairs a legend label with the agent factory handed to multi_run.
bias_variants = [
    (
        "alpha = 0.1",
        lambda: ConstantStepEpsilonGreedy(testbed_env_gen(), epsilon=0.1, alpha=0.1),
    ),
    (
        "alpha = 0.1 nobias",
        lambda: ExpWeightNoBiasEpsilonGreedy(testbed_env_gen(), epsilon=0.1, alpha=0.1),
    ),
    (
        "alpha = 0.01",
        lambda: ConstantStepEpsilonGreedy(testbed_env_gen(), epsilon=0.1, alpha=0.01),
    ),
    (
        "alpha = 0.01 nobias",
        lambda: ExpWeightNoBiasEpsilonGreedy(testbed_env_gen(), epsilon=0.1, alpha=0.01),
    ),
]

# Run the variants in listed order; each result is (mean_rewards, mean_bestactions).
bias_results = [
    multi_run(agent_gen, default_numsteps, default_numruns)
    for _, agent_gen in bias_variants
]
(
    (mean_rewards1, mean_bestactions1),
    (mean_rewards2, mean_bestactions2),
    (mean_rewards3, mean_bestactions3),
    (mean_rewards4, mean_bestactions4),
) = bias_results

multi_run_graph(
    [label for label, _ in bias_variants],
    [mean_rewards1, mean_rewards2, mean_rewards3, mean_rewards4],
    [mean_bestactions1, mean_bestactions2, mean_bestactions3, mean_bestactions4],
    title="Constant-step 0.1-greedy algorithms with and without bias",
)
plt.show()