<a href="https://colab.research.google.com/github/astrfo/RS_init/blob/main/RS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RS 実装

[満足化を通じた最適な自律的探索](https://www.jstage.jst.go.jp/article/pjsai/JSAI2018/0/JSAI2018_1Z304/_article/-char/ja/)

[論文要約](https://colab.research.google.com/drive/199SxVBGDdkuzSzU9pjVP9BfMPzaypLfj#scrollTo=UUSaBpguVsBB)




In [3]:
import sys
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline

In [17]:
class Environment(object):
    """K-armed Bernoulli bandit whose success probabilities are drawn at random.

    Attributes:
        prob: list of K floats, one success probability per arm.
    """

    def __init__(self, K):
        """Create the bandit and draw one success probability per arm.

        Args:
            K: number of arms.
        """
        self._K = K
        # Sample each arm's probability from the uniform distribution on [0, 1).
        self.prob = []
        for _ in range(K):
            self.prob.append(np.random.rand())

    def play(self, arm):
        """Pull ``arm`` once and return the Bernoulli reward.

        Args:
            arm: index into ``self.prob``.

        Returns:
            1 if the arm's probability exceeds a fresh uniform draw, else 0.
        """
        success_prob = self.prob[arm]
        return 1 if success_prob > random.random() else 0

In [15]:
class Agent(object):
    """Bandit agent that acts greedily on the RS evaluation value.

    For every arm i the agent tracks
        RS_i = (n_i / N) * (V_i - aleph)
    where n_i is the number of pulls of arm i, N the total number of pulls,
    V_i the running mean reward of arm i and ``aleph`` the reference level
    that V_i is compared against.

    ``initialize`` must be called before ``update`` or ``select_arm``; until
    then the per-arm state is ``None``.
    """

    def __init__(self, K):
        """Store the number of arms; per-arm state is created by ``initialize``.

        Args:
            K: number of arms.
        """
        self._K = K
        self._alpha = None
        self._V = None
        self._n = None
        self._N = None
        self.RS = None
        self.aleph = 1.0

    def initialize(self):
        """Reset all learned state so a new trial can start."""
        self._V = np.full(self._K, 0.5)
        # Pull counts start at exactly zero. The RS formula divides by the
        # total count N (which is incremented before use), never by n, so no
        # tiny non-zero seed is needed to avoid a division by zero.
        self._n = np.zeros(self._K)
        self._N = 0
        self.RS = np.zeros(self._K)

    def update(self, arm, reward):
        """Incorporate one observed reward and refresh the RS values.

        Args:
            arm: index of the arm that was pulled.
            reward: reward observed for that pull.
        """
        # Step size 1 / (1 + n) makes V the sample mean of the observed
        # rewards; the first pull (n == 0) fully replaces the initial 0.5.
        self._alpha = 1 / (1 + self._n[arm])
        self._V[arm] = (1 - self._alpha) * self._V[arm] + self._alpha * reward
        self._n[arm] += 1
        self._N += 1
        # N changed, so the n_i / N factor changed for *every* arm: recompute
        # all RS values instead of only the pulled arm's. Written in place so
        # existing references to ``self.RS`` stay valid.
        self.RS[:] = self._n / self._N * (self._V - self.aleph)

    def select_arm(self):
        """Return the index of an arm with maximal RS value.

        Ties are broken uniformly at random.

        Returns:
            A plain ``int`` arm index, usable to index lists as well as arrays.
        """
        best_value = np.amax(self.RS)
        # flatnonzero yields a flat index array; np.where would return a
        # 1-tuple, and choosing from that tuple hands back the whole array.
        candidates = np.flatnonzero(self.RS == best_value)
        return random.choice(candidates.tolist())

In [20]:
class Simulator(object):
    """Runs repeated bandit trials and records the cumulative expected regret."""

    def __init__(self, trial, step, K, prob=None):
        """Set up the agent, the environment and the regret buffer.

        Args:
            trial: number of independent trials to run.
            step: number of arm pulls per trial.
            K: number of arms.
            prob: optional sequence of K success probabilities, one per arm.
                When omitted (``None``) the environment keeps the
                probabilities it drew at random.

        Raises:
            ValueError: if ``prob`` is given but does not contain K entries.
        """
        self._agent = Agent(K)
        self._env = Environment(K)
        if prob is not None:
            # Honour the caller-supplied probabilities instead of silently
            # discarding them in favour of the environment's random draw.
            if len(prob) != K:
                raise ValueError(
                    "prob must contain exactly K=%d probabilities, got %d"
                    % (K, len(prob))
                )
            self._env.prob = list(prob)
        self._prob = self._env.prob
        # The best arm's probability is fixed for the whole simulation, so
        # look it up once rather than on every step.
        self._best_prob = max(self._prob)
        self._trial = trial
        self._step = step
        self._regret = np.zeros((trial, step))

    def run(self):
        """Run every trial, re-initialising the agent at the start of each."""
        for t in range(self._trial):
            self._agent.initialize()
            for s in range(self._step):
                arm = self._agent.select_arm()
                reward = self._env.play(arm)
                self._agent.update(arm, reward)
                self.calc_regret(t, s, arm)

    def calc_regret(self, t, s, arm):
        """Store the cumulative expected regret of trial ``t`` up to step ``s``.

        Args:
            t: trial index.
            s: step index within the trial.
            arm: arm pulled at this step.
        """
        instant_regret = self._best_prob - self._prob[arm]
        previous_total = self._regret[t, s - 1] if s != 0 else 0.0
        # Assign rather than accumulate so that calling run() again overwrites
        # the buffer instead of stacking on top of the previous run's values.
        self._regret[t, s] = previous_total + instant_regret

In [2]:
# Number of arms.
K = 20
# One success probability per arm, each drawn uniformly from [0, 1).
arm_prob = np.fromiter((np.random.rand() for _ in range(K)), dtype=float, count=K)
arm_prob

array([0.01552502, 0.83617327, 0.38771308, 0.52052895, 0.96019493,
       0.47733024, 0.33209924, 0.38663927, 0.70810371, 0.29002406,
       0.82375415, 0.13382304, 0.68687707, 0.42595111, 0.02935369,
       0.38028751, 0.31425619, 0.06047769, 0.28614631, 0.43704415])

In [8]:
# Sanity check: the smallest positive normalized float is absorbed when added
# to 1, so this expression evaluates to exactly 1.0.
sys.float_info.min + 1

1.0