In [1]:
import gym
import gym_bandits
import numpy as np
import pprint
import json
import matplotlib.pyplot as plt
import plotly.graph_objects as go

In [2]:
# Seed NumPy's global RNG so the np.random draws made by the agents
# (np.random.random / randint / choice) are repeatable across kernel restarts.
np.random.seed(0)

In [3]:
# --- Experiment settings ----------------------------------------------------
# Every run lasts narms * RUN_PER_ARM environment steps.
RUN_PER_ARM = 50
# Exploration probability handed to every SigmaGreedy agent.
SIGMA = 0.3
# Number of runs whose total rewards are averaged per (gym, agent) pair.
GRADE_TIMES = 3

# Gym environment id -> number of arms. Insertion order matters: it is the
# order in which the environments are evaluated.
all_gyms = dict.fromkeys(
    (
        # Two-armed variants, currently disabled (they would map to 2 arms):
        # "BanditTwoArmedDeterministicFixed-v0",
        # "BanditTwoArmedHighLowFixed-v0",
        # "BanditTwoArmedHighHighFixed-v0",
        # "BanditTwoArmedLowLowFixed-v0",
        "BanditTenArmedRandomFixed-v0",
        "BanditTenArmedRandomRandom-v0",
        "BanditTenArmedUniformDistributedReward-v0",
        "BanditTenArmedGaussian-v0",
    ),
    10,
)

# Filled by the experiment loop: result_table[gym id][agent name] -> summary dict.
result_table = dict()


In [4]:
class SigmaGreedy:
    """Greedy bandit agent that explores with probability ``sigma``.

    Per-arm knowledge is kept in ``knlgs``, a list holding one
    ``{"reward": running mean reward, "times": pull count}`` dict per arm.

    Keyword options (both default to ``False``):

    * ``reduce_sigma`` -- use ``sigma / sqrt(explore_times + 1)`` as the
      exploration probability, so it shrinks as explorations accumulate.
    * ``max_try`` -- when exploring, pull the least-pulled arm instead of a
      uniformly random one.
    """

    def __init__(self, narms, sigma, **kwargs):
        self.narms = narms
        self.sigma = sigma
        self.reduce_sigma = kwargs.get("reduce_sigma", False)
        self.max_try = kwargs.get("max_try", False)
        # Counters and per-arm knowledge start exactly as after a reset.
        self.reset()

    def reset(self):
        """Forget everything learned and zero the step counters.

        A brand-new ``knlgs`` list is created, so lists previously handed out
        by ``inspect_knlg`` keep their old contents.
        """
        self.times = 0
        self.explore_times = 0
        self.knlgs = [{"reward": 0, "times": 0} for _ in range(self.narms)]

    def step(self):
        """Count one step and return the index of the arm to pull."""
        self.times += 1
        return self._explore() if self._check_sigma() else self._exploit()

    def _explore(self):
        """Pick an exploratory arm and count the exploration."""
        self.explore_times += 1
        if not self.max_try:
            return np.random.randint(0, self.narms)
        pull_counts = [entry["times"] for entry in self.knlgs]
        return np.argmin(pull_counts)

    def _exploit(self):
        """Pick the arm with the best running mean reward."""
        return self._argmax()

    def _check_sigma(self):
        """Draw once from the global NumPy RNG; True means 'explore'."""
        draw = np.random.random()
        threshold = self.sigma
        if self.reduce_sigma:
            threshold = self.sigma / np.sqrt(self.explore_times + 1)
        return draw < threshold

    def _argmax(self):
        """Index of the highest running mean (first one wins on ties)."""
        mean_rewards = [entry["reward"] for entry in self.knlgs]
        return np.argmax(mean_rewards)

    def review(self, act, reward):
        """Fold ``reward`` into the running mean of arm ``act``."""
        entry = self.knlgs[act]
        pulls = entry["times"]
        self.knlgs[act] = {
            "reward": (entry["reward"] * pulls + reward) / (pulls + 1),
            "times": pulls + 1,
        }

    def glance(self):
        """Print the per-arm knowledge and the step counters."""
        print("inspect knlgs:", self.knlgs)
        print("inspect times:", self.times, ", ", "explore times:", self.explore_times)

    def inspect_knlg(self):
        """Return the live per-arm knowledge list (not a copy)."""
        return self.knlgs

    def get_name(self):
        """Short label encoding which options are switched on."""
        parts = ["SG"]
        if self.max_try:
            parts.append("_MAX_TRY")
        if self.reduce_sigma:
            parts.append("_REDUCE_SIGMA")
        return "".join(parts)


# agent = SigmaGreedy(10, 0.3)


In [5]:
class SoftMax:
    """Bandit agent that samples arms from a softmax over mean rewards.

    Arm ``i`` is pulled with probability proportional to
    ``exp(mean_reward[i] / miu)``, where ``miu`` acts as the temperature.
    Per-arm knowledge is kept in ``knlgs``, a list holding one
    ``{"reward": running mean reward, "times": pull count}`` dict per arm.

    Args:
        narms: number of arms.
        miu: temperature dividing the mean rewards before exponentiation.
    """

    def __init__(self, narms, miu):
        self.miu = miu
        self.narms = narms
        # Counters and per-arm knowledge start exactly as after a reset.
        self.reset()

    def reset(self):
        """Forget everything learned and zero the step counters.

        A brand-new ``knlgs`` list is created, so lists previously handed out
        by ``inspect_knlg`` keep their old contents.
        """
        self.knlgs = [{"reward": 0, "times": 0} for _ in range(self.narms)]
        self.times = 0
        self.explore_times = 0

    def step(self):
        """Sample and return the index of the arm to pull.

        A step counts as an exploration when the sampled arm is not the one
        with the highest running mean reward.
        """
        self.times += 1
        rewards = np.array([knlg["reward"] for knlg in self.knlgs], dtype=float)
        logits = rewards / self.miu
        # Subtracting the largest logit leaves the softmax unchanged but keeps
        # exp() from overflowing to inf (which would turn the probabilities
        # into NaN and make np.random.choice raise) when rewards are large.
        weights = np.exp(logits - np.max(logits))
        choice = np.random.choice(self.narms, p=weights / np.sum(weights))
        if choice != np.argmax(rewards):
            self.explore_times += 1
        return choice

    def review(self, act, reward):
        """Fold ``reward`` into the running mean of arm ``act``."""
        old_knlg = self.knlgs[act]
        old_tries = old_knlg["times"]
        old_reward = old_knlg["reward"]

        self.knlgs[act] = {
            "reward": (old_reward * old_tries + reward) / (old_tries + 1),
            "times": old_tries + 1,
        }

    def glance(self):
        """Print the per-arm knowledge and the step counters."""
        print("inspect knlgs:", self.knlgs)
        print("inspect times:", self.times, ", ", "explore times:", self.explore_times)

    def inspect_knlg(self):
        """Return the live per-arm knowledge list (not a copy)."""
        return self.knlgs

    def get_name(self):
        """Short label used as the agent's key in result tables."""
        return "SM"


In [6]:
def run(env, agent, narms, run_per_arm=None):
    """Let ``agent`` play ``env`` and return the summed reward.

    Each iteration asks the agent for an arm via ``agent.step()``, pulls it
    with ``env.step(act)``, and reports the reward back through
    ``agent.review(act, reward)``.

    Args:
        env: environment whose ``step(action)`` returns a tuple with the
            reward as its second element.
        agent: object exposing ``step()`` and ``review(act, reward)``.
        narms: number of arms; the run lasts ``narms * run_per_arm`` steps.
        run_per_arm: pulls budgeted per arm. ``None`` (the default) uses the
            module-level ``RUN_PER_ARM``.

    Returns:
        The sum of all rewards collected during the run.
    """
    if run_per_arm is None:
        run_per_arm = RUN_PER_ARM

    total_reward = 0
    for _ in range(narms * run_per_arm):
        act = agent.step()
        # Only the reward is needed. Reading it by position (index 1) works
        # for both the 4-tuple and the 5-tuple Gym step return layouts.
        rwd = env.step(act)[1]
        total_reward += rwd
        agent.review(act, rwd)

    return total_reward


In [7]:
def format_knlgs(lknlgs):
    """Average per-arm knowledge across several runs.

    Args:
        lknlgs: non-empty list with one knowledge list per run; each knowledge
            list holds one ``{"reward": ..., "times": ...}`` dict per arm.

    Returns:
        One ``{"reward": mean reward, "times": mean pull count}`` dict per arm.
    """
    return [
        {
            "reward": np.mean([knlg[i]["reward"] for knlg in lknlgs]),
            "times": np.mean([knlg[i]["times"] for knlg in lknlgs]),
        }
        for i in range(len(lknlgs[0]))
    ]


# Evaluate every agent on every gym and store the results in result_table.
for name, arms in all_gyms.items():
    env = gym.make(name)

    print("gym", name, "start")
    print("gym arms:", arms)
    print("gym props:", env.p_dist, env.r_dist)
    if isinstance(env.r_dist, list):
        # NOTE(review): presumably each entry is a (mean, std) pair, so only
        # the first element is used as the arm's reward -- confirm against the
        # gym_bandits environment definitions.
        mean_reward = np.multiply(env.p_dist, [r[0] for r in env.r_dist])
    else:
        mean_reward = np.multiply(env.p_dist, env.r_dist)

    # Reference score: total obtained by collecting the best arm's mean reward
    # on every one of the arms * RUN_PER_ARM steps of a run.
    max_reward = arms * RUN_PER_ARM * np.max(mean_reward)
    print("gym total max reward:", max_reward)
    result_table[name] = {}

    agents = [
        SigmaGreedy(arms, SIGMA),
        SigmaGreedy(arms, SIGMA, reduce_sigma=True),
        SigmaGreedy(arms, SIGMA, max_try=True),
        SigmaGreedy(arms, SIGMA, reduce_sigma=True, max_try=True),
        SoftMax(arms, 0.25),
    ]

    for agent in agents:
        total_rewards = []
        total_knlgs = []

        print("agent name is:", agent.get_name())

        for _ in range(GRADE_TIMES):
            env.reset()
            env.seed(0)
            agent.reset()
            total_reward = run(env, agent, arms)
            total_rewards.append(total_reward)
            agent.glance()
            # agent.reset() builds a fresh knowledge list each run, so the
            # list captured here is not overwritten by later runs.
            total_knlgs.append(agent.inspect_knlg())

        print("total rewards:", total_rewards)
        # Grade = average total reward as a fraction of the reference score.
        grade = np.mean(total_rewards) / max_reward
        print("grade of agent:", grade)

        result_table[name][agent.get_name()] = {
            "grade": grade,
            "knlgs": format_knlgs(total_knlgs),
        }

    # Close only after every agent has used this environment; closing inside
    # the agent loop would hand an already-closed env to the remaining agents.
    env.close()


gym BanditTenArmedRandomFixed-v0 start
gym arms: 10
gym props: [0.5488135  0.71518937 0.60276338 0.54488318 0.4236548  0.64589411
 0.43758721 0.891773   0.96366276 0.38344152] [1 1 1 1 1 1 1 1 1 1]
gym total max reward: 481.83138025051466
agent name is: SG
inspect knlgs: [{'reward': 0.6071428571428571, 'times': 28}, {'reward': 0.6842105263157895, 'times': 19}, {'reward': 0.6842105263157895, 'times': 19}, {'reward': 0.5294117647058824, 'times': 17}, {'reward': 0.3, 'times': 10}, {'reward': 0.6470588235294118, 'times': 17}, {'reward': 0.4166666666666667, 'times': 12}, {'reward': 0.8888888888888888, 'times': 18}, {'reward': 0.9680232558139535, 'times': 344}, {'reward': 0.5625, 'times': 16}]
inspect times: 500 ,  explore times: 136
inspect knlgs: [{'reward': 0.5, 'times': 18}, {'reward': 0.65, 'times': 20}, {'reward': 0.7931034482758621, 'times': 29}, {'reward': 0.625, 'times': 16}, {'reward': 0.4, 'times': 10}, {'reward': 0.4375, 'times': 16}, {'reward': 0.4, 'times': 15}, {'reward': 0.88

In [8]:
# Dump the collected results (per gym, per agent: grade plus run-averaged
# per-arm knowledge) as indented JSON for inspection.
print(json.dumps(result_table, indent=4))


{
    "BanditTenArmedRandomFixed-v0": {
        "SG": {
            "grade": 0.875133260202839,
            "knlgs": [
                {
                    "reward": 0.5841013824884792,
                    "times": 25.666666666666668
                },
                {
                    "reward": 0.6755060728744939,
                    "times": 17.333333333333332
                },
                {
                    "reward": 0.7257713248638838,
                    "times": 22.666666666666668
                },
                {
                    "reward": 0.6155731523378581,
                    "times": 15.333333333333334
                },
                {
                    "reward": 0.4507246376811594,
                    "times": 14.333333333333334
                },
                {
                    "reward": 0.5096677559912853,
                    "times": 14.0
                },
                {
                    "reward": 0.48055555555555557,
                

In [9]:
# Short x-axis labels: the upper-case letters of each gym id, e.g.
# "BanditTenArmedRandomFixed-v0" -> "BTARF". They are derived from
# result_table (not all_gyms) so labels and bar heights always come from the
# same set of environments, in the same order.
rows = ["".join(char for char in key if char.isupper()) for key in result_table]
# Every gym is evaluated with the same agents, so the first entry's keys
# give the list of agent names.
agent_names = list(next(iter(result_table.values())))
fig = go.Figure(
    data=[
        go.Bar(
            name=agent_name,
            x=rows,
            y=[rlt[agent_name]["grade"] for rlt in result_table.values()],
        )
        for agent_name in agent_names
    ]
)
# One group of bars per environment, one bar per agent.
fig.update_layout(
    barmode="group",
    title="Agent grade per bandit environment",
    xaxis_title="bandit environment",
    yaxis_title="grade (mean total reward / max expected reward)",
)
fig


In [10]:
# fig, ax = plt.subplots()
# ax.set_ylabel("score")
# ax.set_xlabel("bandit type")
# ax.set_ylim(0, 1.05)
# rows = [("".join([char for char in key if char.isupper()])) for key in all_gyms.keys()]

# agent_names = list(list(result_table.items())[0][1].keys())
# print(agent_names)
# for i, agent_name in enumerate(agent_names):

#     scores = [rlt[agent_name]["grade"] for rlt in result_table.values()]
#     for idx, score in enumerate(scores):
#         ax.text(idx, score * 1.02, str(round(score, 2)), ha="center")
#     ax.bar(
#         [row + (i - 1) * 0.2 for row in rows],
#         scores,
#         label=agent_name,
#     )

# ax.legend()
