<a href="https://colab.research.google.com/github/astrfo/RS_init/blob/main/RS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import sys
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
class Environment(object):
    """Bernoulli multi-armed bandit environment.

    Args:
        K: Either an integer number of arms, in which case every arm gets a
            success probability drawn from ``np.random.rand()``, or a 1-D
            array-like of per-arm success probabilities that is used as given.

    Raises:
        ValueError: If ``K`` is array-like but not one-dimensional.
    """

    def __init__(self, K):
        if np.ndim(K) == 0:
            # Scalar argument: K is the arm count, probabilities are random.
            self._k = K
            self._prob = [np.random.rand() for _ in range(K)]
        else:
            # Array-like argument: K already holds the arm probabilities.
            # Simulator constructs the environment this way, so the arms that
            # are played match the probabilities used for the regret.
            probabilities = np.asarray(K, dtype=float)
            if probabilities.ndim != 1:
                raise ValueError(
                    'arm probabilities must be one-dimensional, got shape '
                    + str(probabilities.shape)
                )
            self._k = len(probabilities)
            self._prob = probabilities

    def play(self, arm):
        """Pull ``arm`` once and return the reward.

        Returns:
            1 when a fresh ``random.random()`` draw falls below the arm's
            success probability, otherwise 0.
        """
        return 1 if self._prob[arm] > random.random() else 0

In [None]:
class Agent(object):
    """Bandit agent that tracks a sample-average value estimate per arm.

    Args:
        len_arm: Number of arms the agent chooses between.
        policy: Policy name forwarded to ``Policy``.
        param: Policy parameter forwarded to ``Policy``.
    """

    def __init__(self, len_arm, policy, param):
        self._len_arm = len_arm
        self._policy = Policy(policy, param)
        # Estimates and pull counts are allocated by initialize().
        self._V = None
        self._n = None
        # Tiny positive value used as the starting pull count of every arm.
        self._eps = sys.float_info.min

    def initialize(self):
        """Reset every arm to a value estimate of 0.5 and a pull count of eps."""
        arms = range(self._len_arm)
        self._V = np.array([0.5 for _ in arms])
        self._n = np.array([self._eps for _ in arms])

    def update(self, arm, reward):
        """Record one pull of ``arm`` and fold ``reward`` into its estimate."""
        self._n[arm] += 1
        # Sample-average method: move the estimate toward the reward by 1/n.
        # On the first pull n is 1.0 in floating point (eps is absorbed), so
        # the initial 0.5 is replaced by the observed reward.
        error = reward - self._V[arm]
        self._V[arm] += error / self._n[arm]

    def select_arm(self):
        """Return the arm chosen by the policy for the current estimates."""
        return self._policy._act_policy(self._V)

In [None]:
class Policy(object):
    """Action-selection rule used by the agent.

    The chosen rule is exposed as the callable ``_act_policy(V)``, which takes
    the per-arm value estimates and returns the index of the arm to play.

    Args:
        policy: ``'e_greedy'`` or ``'random'``.
        param: Exploration rate epsilon for ``'e_greedy'``; not used by
            ``'random'``.

    Raises:
        ValueError: If ``policy`` is not one of the supported names. Failing
            here avoids a later AttributeError on ``_act_policy``.
    """

    def __init__(self, policy, param):
        if policy == 'e_greedy':
            self._epsilon = param
            self._act_policy = self.e_greedy
        elif policy == 'random':
            self._act_policy = self._random
        else:
            raise ValueError(
                "unknown policy %r; expected 'e_greedy' or 'random'" % (policy,)
            )

    def e_greedy(self, V):
        """Return a greedy arm, or a uniformly random arm with chance epsilon.

        ``V`` may be a NumPy array or any plain sequence of numbers.
        """
        if self._epsilon < random.random():
            # Convert first so the comparison below is element-wise even when
            # V is a plain list.
            values = np.asarray(V)
            best_arms = np.flatnonzero(values == values.max())
            # Several arms can share the maximum; break the tie at random.
            return int(random.choice(best_arms))
        # Exploration step: any arm, uniformly at random.
        return random.randrange(len(V))

    def _random(self, V):
        """Return a uniformly random arm index, ignoring the estimates."""
        return random.randrange(len(V))

In [None]:
class Simulator(object):
    """Runs repeated bandit trials and plots the mean cumulative regret.

    Args:
        trial: Number of independent trials.
        step: Number of arm pulls per trial.
        len_arm: Number of arms, forwarded to ``Agent``.
        prob: Per-arm success probabilities; passed to ``Environment`` and
            also used to compute the regret.
        policy: Policy name forwarded to ``Agent``.
        param: Policy parameter forwarded to ``Agent``; also shown in the
            plot legend.
    """

    def __init__(self, trial, step, len_arm, prob, policy, param):
        self._agent = Agent(len_arm, policy, param)
        self._env = Environment(prob)
        self._prob = prob
        self._trial = trial
        self._step = step
        self._epsilon = param
        # Cumulative regret, one row per trial and one column per step.
        self._regret = np.zeros((trial, step))

    def run(self):
        """Run every trial, re-initialising the agent each time, then plot."""
        for t in range(self._trial):
            self._agent.initialize()
            for s in range(self._step):
                arm = self._agent.select_arm()
                reward = self._env.play(arm)
                self._agent.update(arm, reward)
                self.calc_regret(t, s, arm)
        self.print_regret()

    def calc_regret(self, t, s, arm):
        """Store the cumulative regret of trial ``t`` up to step ``s``.

        The step regret is the gap between the best arm's probability and
        the probability of the arm that was played.
        """
        carried = self._regret[t][s - 1] if s != 0 else 0.0
        # Assign rather than accumulate: the cell is then correct even when
        # run() is called again on the same simulator, instead of adding to
        # the values left over from the previous run.
        self._regret[t][s] = carried + (max(self._prob) - self._prob[arm])

    def print_regret(self):
        """Plot the cumulative regret averaged over trials against the step."""
        plt.plot(np.arange(self._step), np.mean(self._regret, axis=0), label='epsilon='+str(self._epsilon))
        plt.xlabel('step')
        plt.ylabel('regret')
        plt.legend()
        plt.show()

In [None]:
K = 20

# Experiment settings.
arm_prob = np.array([0.3, 0.5, 0.7])  # one success probability per arm; edit here to change the arms
policy = 'e_greedy'  # action-selection policy: 'e_greedy' or 'random'
epsilon = 0.1
trial = 100
step = 1000

# Build the simulator, run every trial, and plot the mean cumulative regret.
sim = Simulator(
    trial=trial,
    step=step,
    len_arm=len(arm_prob),
    prob=arm_prob,
    policy=policy,
    param=epsilon,
)
sim.run()

In [2]:
# Draw K arm probabilities uniformly from [0, 1) and display them.
K = 20
arm_prob = np.random.rand(K)
arm_prob

array([0.01552502, 0.83617327, 0.38771308, 0.52052895, 0.96019493,
       0.47733024, 0.33209924, 0.38663927, 0.70810371, 0.29002406,
       0.82375415, 0.13382304, 0.68687707, 0.42595111, 0.02935369,
       0.38028751, 0.31425619, 0.06047769, 0.28614631, 0.43704415])

In [4]:
# Smallest positive normalized float; Agent uses this value as the initial
# pull count of every arm.
sys.float_info.min

2.2250738585072014e-308