<a href="https://colab.research.google.com/github/astrfo/RS_init/blob/main/RS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RS 実装

[満足化を通じた最適な自律的探索](https://www.jstage.jst.go.jp/article/pjsai/JSAI2018/0/JSAI2018_1Z304/_article/-char/ja/)

[論文要約](https://colab.research.google.com/drive/199SxVBGDdkuzSzU9pjVP9BfMPzaypLfj#scrollTo=UUSaBpguVsBB)




In [3]:
import sys
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline

In [10]:
class Environment(object):
    """K-armed Bernoulli bandit with randomly drawn success probabilities.

    One success probability per arm is sampled at construction time and kept
    in ``_prob``; ``play`` then returns a 0/1 reward for the requested arm.
    """

    def __init__(self, K):
        """Create ``K`` arms and draw one success probability for each.

        Args:
            K: Number of arms.
        """
        self._K = K
        # One np.random.rand() call per arm: a uniform sample on [0, 1).
        self._prob = []
        for _ in range(K):
            self._prob.append(np.random.rand())

    def play(self, arm):
        """Pull ``arm`` once.

        Args:
            arm: Index of the arm to pull.

        Returns:
            1 if the arm's stored probability exceeds a fresh
            ``random.random()`` draw, otherwise 0.
        """
        success_prob = self._prob[arm]
        draw = random.random()
        return 1 if draw < success_prob else 0

In [14]:
class Agent(object):
    """Bandit agent that scores arms with the RS (satisficing) value.

    For every arm ``i`` the agent keeps a value estimate ``V_i`` and a pull
    count ``n_i`` and scores the arm as::

        RS_i = (n_i / N) * (V_i - aleph)

    where ``N`` is the total number of pulls and ``aleph`` is the aspiration
    level. ``select_arm`` returns an arm with the highest RS score.
    """

    def __init__(self, K, aleph=1.0):
        """Create an agent for ``K`` arms and reset its statistics.

        Args:
            K: Number of arms.
            aleph: Aspiration level subtracted from ``V`` in the RS score.
                Defaults to 1.0, the value that used to be hard-coded.
        """
        self._V = None
        self._n = None
        self._alpha = None
        self._N = None
        self._K = K
        # Tiny positive initial pull count, originally introduced to guard the
        # RS formula against division by zero. It is absorbed by the first
        # increment (sys.float_info.min + 1 == 1.0).
        self._eps = sys.float_info.min
        self.RS = None
        self.aleph = aleph
        # Start in a usable state so update()/select_arm() work even when the
        # caller forgets to call initialize() explicitly.
        self.initialize()

    def initialize(self):
        """Reset value estimates, pull counts and RS scores for a new run."""
        self._V = np.full(self._K, 0.5)
        self._n = np.full(self._K, self._eps)
        self._N = 0
        self.RS = np.zeros(self._K)

    def update(self, arm, reward):
        """Record ``reward`` obtained from ``arm`` and refresh all RS scores.

        Args:
            arm: Index of the arm that was pulled.
            reward: Observed reward for that pull.
        """
        # Step size 1 / (1 + n) turns the update below into a running average.
        self._alpha = 1 / (1 + self._n[arm])
        self._V[arm] = (1 - self._alpha) * self._V[arm] + self._alpha * reward
        self._n[arm] += 1
        self._N += 1
        # N appears in every arm's score, so all entries must be recomputed;
        # refreshing only RS[arm] would leave the other arms stale.
        self.RS[:] = self._n / self._N * (self._V - self.aleph)

    def select_arm(self):
        """Return the index of an arm with the highest RS score.

        Ties are broken uniformly at random with ``random.choice``.

        Returns:
            The chosen arm index as a plain ``int``.
        """
        best_score = np.amax(self.RS)
        # flatnonzero gives a flat index array; np.where would return a tuple,
        # and random.choice on that tuple yields the whole array, not one arm.
        candidates = np.flatnonzero(self.RS == best_score).tolist()
        return random.choice(candidates)

In [2]:
# Number of arms for this demo, and one success probability per arm drawn
# uniformly from [0, 1); the trailing bare name displays the array.
K = 20
arm_prob = np.random.rand(K)
arm_prob

array([0.01552502, 0.83617327, 0.38771308, 0.52052895, 0.96019493,
       0.47733024, 0.33209924, 0.38663927, 0.70810371, 0.29002406,
       0.82375415, 0.13382304, 0.68687707, 0.42595111, 0.02935369,
       0.38028751, 0.31425619, 0.06047769, 0.28614631, 0.43704415])

In [8]:
# Sanity check: sys.float_info.min is so small that adding it to 1 is absorbed
# by rounding, so this displays exactly 1.0.
sys.float_info.min + 1

1.0